granicus.if.org Git - postgresql/blob - src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlog.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "libpq/pqsignal.h"
  39 #include "miscadmin.h"
  40 #include "pgstat.h"
  41 #include "postmaster/bgwriter.h"
  42 #include "postmaster/startup.h"
  43 #include "replication/walreceiver.h"
  44 #include "replication/walsender.h"
  45 #include "storage/bufmgr.h"
  46 #include "storage/fd.h"
  47 #include "storage/ipc.h"
  48 #include "storage/latch.h"
  49 #include "storage/pmsignal.h"
  50 #include "storage/predicate.h"
  51 #include "storage/proc.h"
  52 #include "storage/procarray.h"
  53 #include "storage/reinit.h"
  54 #include "storage/smgr.h"
  55 #include "storage/spin.h"
  56 #include "utils/builtins.h"
  57 #include "utils/guc.h"
  58 #include "utils/ps_status.h"
  59 #include "utils/relmapper.h"
  60 #include "utils/snapmgr.h"
  61 #include "utils/timestamp.h"
  62 #include "pg_trace.h"
  63
  64
  65 /* File path names (all relative to $PGDATA) */
  66 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  67 #define RECOVERY_COMMAND_DONE   "recovery.done"
  68 #define PROMOTE_SIGNAL_FILE "promote"
  69
  70
  71 /* User-settable parameters */
  72 int                     CheckPointSegments = 3;
  73 int                     wal_keep_segments = 0;
  74 int                     XLOGbuffers = -1;
  75 int                     XLogArchiveTimeout = 0;
  76 bool            XLogArchiveMode = false;
  77 char       *XLogArchiveCommand = NULL;
  78 bool            EnableHotStandby = false;
  79 bool            fullPageWrites = true;
  80 bool            log_checkpoints = false;
  81 int                     sync_method = DEFAULT_SYNC_METHOD;
  82 int                     wal_level = WAL_LEVEL_MINIMAL;
  83
  84 #ifdef WAL_DEBUG
  85 bool            XLOG_DEBUG = false;
  86 #endif
  87
  88 /*
  89  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  90  * When we are done with an old XLOG segment file, we will recycle it as a
  91  * future XLOG segment as long as there aren't already XLOGfileslop future
  92  * segments; else we'll delete it.  This could be made a separate GUC
  93  * variable, but at present I think it's sufficient to hardwire it as
   94  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  95  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  96  * of them; the +1 allows boundary cases to happen without wasting a
  97  * delete/create-segment cycle.
  98  */
  99 #define XLOGfileslop    (2*CheckPointSegments + 1)
 100
 101 /*
 102  * GUC support
 103  */
  104 const struct config_enum_entry wal_level_options[] = {
  105         {"minimal", WAL_LEVEL_MINIMAL, false},
  106         {"archive", WAL_LEVEL_ARCHIVE, false},
  107         {"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
  108         {NULL, 0, false}                /* NULL name: presumably the end-of-list marker -- confirm */
  109 };
 110
  111 const struct config_enum_entry sync_method_options[] = {
  112         {"fsync", SYNC_METHOD_FSYNC, false},    /* the only entry not guarded by an #ifdef */
  113 #ifdef HAVE_FSYNC_WRITETHROUGH
  114         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  115 #endif   /* HAVE_FSYNC_WRITETHROUGH */
  116 #ifdef HAVE_FDATASYNC
  117         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  118 #endif   /* HAVE_FDATASYNC */
  119 #ifdef OPEN_SYNC_FLAG
  120         {"open_sync", SYNC_METHOD_OPEN, false},
  121 #endif   /* OPEN_SYNC_FLAG */
  122 #ifdef OPEN_DATASYNC_FLAG
  123         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  124 #endif   /* OPEN_DATASYNC_FLAG */
  125         {NULL, 0, false}                /* NULL name: presumably the end-of-list marker -- confirm */
  126 };
 127
 128 /*
 129  * Statistics for current checkpoint are collected in this global struct.
  130  * Because only the checkpointer or a stand-alone backend can perform
 131  * checkpoints, this will be unused in normal backends.
 132  */
 133 CheckpointStatsData CheckpointStats;
 134
 135 /*
 136  * ThisTimeLineID will be same in all backends --- it identifies current
 137  * WAL timeline for the database system.
 138  */
 139 TimeLineID      ThisTimeLineID = 0;
 140
 141 /*
 142  * Are we doing recovery from XLOG?
 143  *
 144  * This is only ever true in the startup process; it should be read as meaning
 145  * "this process is replaying WAL records", rather than "the system is in
 146  * recovery mode".  It should be examined primarily by functions that need
 147  * to act differently when called from a WAL redo function (e.g., to skip WAL
 148  * logging).  To check whether the system is in recovery regardless of which
 149  * process you're running in, use RecoveryInProgress() but only after shared
 150  * memory startup and lock initialization.
 151  */
 152 bool            InRecovery = false;
 153
 154 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 155 HotStandbyState standbyState = STANDBY_DISABLED;
 156
 157 static XLogRecPtr LastRec;
 158
 159 /*
 160  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 161  * the replayed WAL records indicate. It's initialized with full_page_writes
 162  * that the recovery starting checkpoint record indicates, and then updated
 163  * each time XLOG_FPW_CHANGE record is replayed.
 164  */
 165 static bool lastFullPageWrites;
 166
 167 /*
 168  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 169  * known, need to check the shared state".
 170  */
 171 static bool LocalRecoveryInProgress = true;
 172
 173 /*
 174  * Local copy of SharedHotStandbyActive variable. False actually means "not
 175  * known, need to check the shared state".
 176  */
 177 static bool LocalHotStandbyActive = false;
 178
 179 /*
 180  * Local state for XLogInsertAllowed():
 181  *              1: unconditionally allowed to insert XLOG
 182  *              0: unconditionally not allowed to insert XLOG
 183  *              -1: must check RecoveryInProgress(); disallow until it is false
 184  * Most processes start with -1 and transition to 1 after seeing that recovery
 185  * is not in progress.  But we can also force the value for special cases.
 186  * The coding in XLogInsertAllowed() depends on the first two of these states
 187  * being numerically the same as bool true and false.
 188  */
 189 static int      LocalXLogInsertAllowed = -1;
 190
 191 /* Are we recovering using offline XLOG archives? */
 192 static bool InArchiveRecovery = false;
 193
 194 /* Was the last xlog file restored from archive, or local? */
 195 static bool restoredFromArchive = false;
 196
 197 /* options taken from recovery.conf for archive recovery */
 198 static char *recoveryRestoreCommand = NULL;
 199 static char *recoveryEndCommand = NULL;
 200 static char *archiveCleanupCommand = NULL;
 201 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 202 static bool recoveryTargetInclusive = true;
 203 static bool recoveryPauseAtTarget = true;
 204 static TransactionId recoveryTargetXid;
 205 static TimestampTz recoveryTargetTime;
 206 static char *recoveryTargetName;
 207
 208 /* options taken from recovery.conf for XLOG streaming */
 209 static bool StandbyMode = false;
 210 static char *PrimaryConnInfo = NULL;
 211 static char *TriggerFile = NULL;
 212
 213 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 214 static TransactionId recoveryStopXid;
 215 static TimestampTz recoveryStopTime;
 216 static char recoveryStopName[MAXFNAMELEN];
 217 static bool recoveryStopAfter;
 218
 219 /*
 220  * During normal operation, the only timeline we care about is ThisTimeLineID.
 221  * During recovery, however, things are more complicated.  To simplify life
 222  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 223  * scan through the WAL history (that is, it is the line that was active when
 224  * the currently-scanned WAL record was generated).  We also need these
 225  * timeline values:
 226  *
 227  * recoveryTargetTLI: the desired timeline that we want to end in.
 228  *
 229  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 230  *
 231  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 232  * its known parents, newest first (so recoveryTargetTLI is always the
 233  * first list member).  Only these TLIs are expected to be seen in the WAL
 234  * segments we read, and indeed only these TLIs will be considered as
 235  * candidate WAL files to open at all.
 236  *
 237  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 238  * (This is not necessarily the same as ThisTimeLineID, because we could
 239  * be scanning data that was copied from an ancestor timeline when the current
 240  * file was created.)  During a sequential scan we do not allow this value
 241  * to decrease.
 242  */
 243 static TimeLineID recoveryTargetTLI;
 244 static bool recoveryTargetIsLatest = false;
 245 static List *expectedTLIs;
 246 static TimeLineID curFileTLI;
 247
 248 /*
 249  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 250  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 251  * end+1 of the last record, and is reset when we end a top-level transaction,
 252  * or start a new one; so it can be used to tell if the current transaction has
 253  * created any XLOG records.
 254  */
 255 static XLogRecPtr ProcLastRecPtr = {0, 0};
 256
 257 XLogRecPtr      XactLastRecEnd = {0, 0};
 258
 259 /*
 260  * RedoRecPtr is this backend's local copy of the REDO record pointer
 261  * (which is almost but not quite the same as a pointer to the most recent
 262  * CHECKPOINT record).  We update this from the shared-memory copy,
 263  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 264  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 265  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 266  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 267  * InitXLOGAccess.
 268  */
 269 static XLogRecPtr RedoRecPtr;
 270
 271 /*
 272  * RedoStartLSN points to the checkpoint's REDO location which is specified
 273  * in a backup label file, backup history file or control file. In standby
 274  * mode, XLOG streaming usually starts from the position where an invalid
 275  * record was found. But if we fail to read even the initial checkpoint
 276  * record, we use the REDO location instead of the checkpoint location as
 277  * the start position of XLOG streaming. Otherwise we would have to jump
 278  * backwards to the REDO location after reading the checkpoint record,
 279  * because the REDO record can precede the checkpoint record.
 280  */
 281 static XLogRecPtr RedoStartLSN = {0, 0};
 282
 283 /*----------
 284  * Shared-memory data structures for XLOG control
 285  *
 286  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 287  * the log up to (all records before that point must be written or fsynced).
 288  * LogwrtResult indicates the byte positions we have already written/fsynced.
 289  * These structs are identical but are declared separately to indicate their
 290  * slightly different functions.
 291  *
 292  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 293  * WALWriteLock.  To update it, you need to hold both locks.  The point of
 294  * this arrangement is that the value can be examined by code that already
 295  * holds WALWriteLock without needing to grab info_lck as well.  In addition
 296  * to the shared variable, each backend has a private copy of LogwrtResult,
 297  * which is updated when convenient.
 298  *
 299  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 300  * (protected by info_lck), but we don't need to cache any copies of it.
 301  *
 302  * info_lck is only held long enough to read/update the protected variables,
 303  * so it's a plain spinlock.  The other locks are held longer (potentially
 304  * over I/O operations), so we use LWLocks for them.  These locks are:
 305  *
 306  * WALInsertLock: must be held to insert a record into the WAL buffers.
 307  *
 308  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 309  * XLogFlush).
 310  *
 311  * ControlFileLock: must be held to read/update control file or create
 312  * new log file.
 313  *
 314  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 315  * only one checkpointer at a time; currently, with all checkpoints done by
 316  * the checkpointer, this is just pro forma).
 317  *
 318  *----------
 319  */
 320
  321 typedef struct XLogwrtRqst
  322 {
  323         XLogRecPtr      Write;                  /* requested: last byte + 1 to write out */
  324         XLogRecPtr      Flush;                  /* requested: last byte + 1 to flush (fsync) */
  325 } XLogwrtRqst;
 326
  327 typedef struct XLogwrtResult
  328 {
  329         XLogRecPtr      Write;                  /* done so far: last byte + 1 written out */
  330         XLogRecPtr      Flush;                  /* done so far: last byte + 1 flushed (fsynced) */
  331 } XLogwrtResult;
 332
 333 /*
 334  * Shared state data for XLogInsert.
 335  */
  336 typedef struct XLogCtlInsert
  337 {
  338         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
  339         int                     curridx;                /* index of the current block in the XLOG buffer cache */
  340         XLogPageHeader currpage;        /* points to header of block in cache */
  341         char       *currpos;            /* current insertion point in cache */
  342         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  343         bool            forcePageWrites;        /* forcing full-page writes for PITR? (see backup fields below) */
  344
  345         /*
  346          * fullPageWrites is the master copy used by all backends to determine
  347          * whether to write full-page images to WAL, instead of a process-local one.
  348          * This is required because, when full_page_writes is changed by SIGHUP,
  349          * we must WAL-log it before it actually affects WAL-logging by backends.
  350          * The checkpointer sets it at startup or after SIGHUP.
  351          */
  352         bool            fullPageWrites;
  353
  354         /*
  355          * exclusiveBackup is true if a backup started with pg_start_backup() is
  356          * in progress, and nonExclusiveBackups is a counter indicating the number
  357          * of streaming base backups currently in progress. forcePageWrites is set
  358          * to true when either of these is non-zero. lastBackupStart is the latest
  359          * checkpoint redo location used as a starting point for an online backup.
  360          */
  361         bool            exclusiveBackup;
  362         int                     nonExclusiveBackups;
  363         XLogRecPtr      lastBackupStart;
  364 } XLogCtlInsert;
 365
 366 /*
 367  * Shared state data for XLogWrite/XLogFlush.
 368  */
  369 typedef struct XLogCtlWrite
  370 {
  371         int                     curridx;                /* XLOG buffer cache index of next block to write */
  372         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
  373 } XLogCtlWrite;
 374
 375 /*
 376  * Total shared-memory state for XLOG.
 377  */
  378 typedef struct XLogCtlData
  379 {
  380         /* Protected by WALInsertLock: */
  381         XLogCtlInsert Insert;
  382
  383         /* Protected by info_lck: */
  384         XLogwrtRqst LogwrtRqst;
  385         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  386         TransactionId ckptXid;
  387         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  388         uint32          lastRemovedLog; /* latest removed/recycled XLOG segment */
  389         uint32          lastRemovedSeg;
  390
  391         /* Protected by WALWriteLock: */
  392         XLogCtlWrite Write;
  393
  394         /*
  395          * Protected by info_lck and WALWriteLock (you must hold either lock to
  396          * read it, but both to update)
  397          */
  398         XLogwrtResult LogwrtResult;
  399
  400         /*
  401          * These values do not change after startup, although the pointed-to pages
  402          * and xlblocks values certainly do.  Permission to read/write the pages
  403          * and xlblocks values depends on WALInsertLock and WALWriteLock.
  404          */
  405         char       *pages;                      /* buffers for unwritten XLOG pages */
  406         XLogRecPtr *xlblocks;           /* per buffer: ptr to its 1st byte + XLOG_BLCKSZ (end+1 of page) */
  407         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  408         TimeLineID      ThisTimeLineID;
  409         TimeLineID      RecoveryTargetTLI;
  410
  411         /*
  412          * archiveCleanupCommand is read from recovery.conf but needs to be in
  413          * shared memory so that the checkpointer process can access it.
  414          */
  415         char            archiveCleanupCommand[MAXPGPATH];
  416
  417         /*
  418          * SharedRecoveryInProgress indicates if we're still in crash or archive
  419          * recovery.  Protected by info_lck.
  420          */
  421         bool            SharedRecoveryInProgress;
  422
  423         /*
  424          * SharedHotStandbyActive indicates whether Hot Standby is currently
  425          * active.  Protected by info_lck.
  426          */
  427         bool            SharedHotStandbyActive;
  428
  429         /*
  430          * WalWriterSleeping indicates whether the WAL writer is currently in
  431          * low-power mode (and hence should be nudged if an async commit occurs).
  432          * Protected by info_lck.
  433          */
  434         bool            WalWriterSleeping;
  435
  436         /*
  437          * recoveryWakeupLatch is used to wake up the startup process to continue
  438          * WAL replay, if it is waiting for WAL to arrive or failover trigger file
  439          * to appear.
  440          */
  441         Latch           recoveryWakeupLatch;
  442
  443         /*
  444          * During recovery, we keep a copy of the latest checkpoint record here.
  445          * Used by the checkpointer when it wants to create a restartpoint.
  446          *
  447          * Protected by info_lck.
  448          */
  449         XLogRecPtr      lastCheckPointRecPtr;
  450         CheckPoint      lastCheckPoint;
  451
  452         /* end+1 of the last record replayed (or being replayed) */
  453         XLogRecPtr      replayEndRecPtr;
  454         /* end+1 of the last record replayed */
  455         XLogRecPtr      recoveryLastRecPtr;
  456         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
  457         TimestampTz recoveryLastXTime;
  458
  459         /*
  460          * timestamp of when we started replaying the current chunk of WAL data,
  461          * only relevant for replication or archive recovery
  462          */
  463         TimestampTz currentChunkStartTime;
  464         /* end of the last record restored from the archive */
  465         XLogRecPtr      restoreLastRecPtr;
  466         /* Are we requested to pause recovery? */
  467         bool            recoveryPause;
  468
  469         /*
  470          * lastFpwDisableRecPtr points to the start of the last replayed
  471          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  472          */
  473         XLogRecPtr      lastFpwDisableRecPtr;
  474
  475         slock_t         info_lck;               /* locks shared variables shown above */
  476 } XLogCtlData;
 477
 478 static XLogCtlData *XLogCtl = NULL;
 479
 480 /*
 481  * We maintain an image of pg_control in shared memory.
 482  */
 483 static ControlFileData *ControlFile = NULL;
 484
 485 /*
 486  * Macros for managing XLogInsert state.  In most cases, the calling routine
  487  * has local copies of &XLogCtl->Insert and/or XLogCtl->Insert.curridx,
 488  * so these are passed as parameters instead of being fetched via XLogCtl.
 489  */
 490
  491 /* Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus the offset of Insert->currpos within Insert->currpage */
  492 #define INSERT_FREESPACE(Insert)  \
  493         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 494
  495 /* Construct XLogRecPtr value for current insertion point: xlblocks[curridx] minus the free space left in that page (stored into recptr) */
  496 #define INSERT_RECPTR(recptr,Insert,curridx)  \
  497         ( \
  498           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
  499           (recptr).xrecoff = \
  500                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
  501         )
 502
  503 #define PrevBufIdx(idx)         \
  504                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))   /* previous buffer index; wraps from 0 to XLogCacheBlck */
 505
  506 #define NextBufIdx(idx)         \
  507                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))   /* next buffer index; wraps from XLogCacheBlck to 0 */
 508
 509 /*
 510  * Private, possibly out-of-date copy of shared LogwrtResult.
 511  * See discussion above.
 512  */
 513 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 514
 515 /*
 516  * Codes indicating where we got a WAL file from during recovery, or where
 517  * to attempt to get one.  These are chosen so that they can be OR'd together
 518  * in a bitmask state variable.
 519  */
 520 #define XLOG_FROM_ARCHIVE               (1<<0)  /* Restored using restore_command */
 521 #define XLOG_FROM_PG_XLOG               (1<<1)  /* Existing file in pg_xlog */
 522 #define XLOG_FROM_STREAM                (1<<2)  /* Streamed from master */
 523
 524 /*
 525  * openLogFile is -1 or a kernel FD for an open log file segment.
 526  * When it's open, openLogOff is the current seek offset in the file.
 527  * openLogId/openLogSeg identify the segment.  These variables are only
 528  * used to write the XLOG, and so will normally refer to the active segment.
 529  */
 530 static int      openLogFile = -1;
 531 static uint32 openLogId = 0;
 532 static uint32 openLogSeg = 0;
 533 static uint32 openLogOff = 0;
 534
 535 /*
 536  * These variables are used similarly to the ones above, but for reading
 537  * the XLOG.  Note, however, that readOff generally represents the offset
 538  * of the page just read, not the seek position of the FD itself, which
 539  * will be just past that page. readLen indicates how much of the current
 540  * page has been read into readBuf, and readSource indicates where we got
 541  * the currently open file from.
 542  */
 543 static int      readFile = -1;
 544 static uint32 readId = 0;
 545 static uint32 readSeg = 0;
 546 static uint32 readOff = 0;
 547 static uint32 readLen = 0;
 548 static int      readSource = 0;         /* XLOG_FROM_* code */
 549
 550 /*
 551  * Keeps track of which sources we've tried to read the current WAL
 552  * record from and failed.
 553  */
 554 static int      failedSources = 0;      /* OR of XLOG_FROM_* codes */
 555
 556 /*
 557  * These variables track when we last obtained some WAL data to process,
 558  * and where we got it from.  (XLogReceiptSource is initially the same as
 559  * readSource, but readSource gets reset to zero when we don't have data
 560  * to process right now.)
 561  */
 562 static TimestampTz XLogReceiptTime = 0;
 563 static int      XLogReceiptSource = 0;          /* XLOG_FROM_* code */
 564
 565 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 566 static char *readBuf = NULL;
 567
 568 /* Buffer for current ReadRecord result (expandable) */
 569 static char *readRecordBuf = NULL;
 570 static uint32 readRecordBufSize = 0;
 571
 572 /* State information for XLOG reading */
 573 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 574 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 575 static TimeLineID lastPageTLI = 0;
 576
 577 static XLogRecPtr minRecoveryPoint;             /* local copy of
 578                                                                                  * ControlFile->minRecoveryPoint */
 579 static bool updateMinRecoveryPoint = true;
 580
 581 /*
 582  * Have we reached a consistent database state? In crash recovery, we have
 583  * to replay all the WAL, so reachedConsistency is never set. During archive
 584  * recovery, the database is consistent once minRecoveryPoint is reached.
 585  */
 586 bool            reachedConsistency = false;
 587
 588 static bool InRedo = false;
 589
 590 /* Have we launched bgwriter during recovery? */
 591 static bool bgwriterLaunched = false;
 592
 593 /*
 594  * Information logged when we detect a change in one of the parameters
 595  * important for Hot Standby.
 596  */
  597 typedef struct xl_parameter_change
  598 {
  599         int                     MaxConnections;
  600         int                     max_prepared_xacts;
  601         int                     max_locks_per_xact;
  602         int                     wal_level;
  603 } xl_parameter_change;          /* NOTE(review): fields appear to mirror the same-named settings -- confirm */
 604
  605 /* logs restore point: a timestamp plus a name buffer */
  606 typedef struct xl_restore_point
  607 {
  608         TimestampTz rp_time;            /* timestamp stored with the restore point */
  609         char            rp_name[MAXFNAMELEN];   /* restore point name; buffer of MAXFNAMELEN bytes */
  610 } xl_restore_point;
 611
 612
 613 static void XLogArchiveNotify(const char *xlog);
 614 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 615 static bool XLogArchiveCheckDone(const char *xlog);
 616 static bool XLogArchiveIsBusy(const char *xlog);
 617 static void XLogArchiveCleanup(const char *xlog);
 618 static void readRecoveryCommandFile(void);
 619 static void exitArchiveRecovery(TimeLineID endTLI,
 620                                         uint32 endLogId, uint32 endLogSeg);
 621 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 622 static void recoveryPausesHere(void);
 623 static void SetLatestXTime(TimestampTz xtime);
 624 static void SetCurrentChunkStartTime(TimestampTz xtime);
 625 static void CheckRequiredParameterValues(void);
 626 static void XLogReportParameters(void);
 627 static void LocalSetXLogInsertAllowed(void);
 628 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 629 static void KeepLogSeg(XLogRecPtr recptr, uint32 *logId, uint32 *logSeg);
 630
 631 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 632                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 633 static bool AdvanceXLInsertBuffer(bool new_segment);
 634 static bool XLogCheckpointNeeded(uint32 logid, uint32 logseg);
 635 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 636 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 637                                            bool find_free, int *max_advance,
 638                                            bool use_lock);
 639 static int XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 640                          int source, bool notexistOk);
 641 static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
 642                                    int sources);
 643 static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 644                          bool randAccess);
 645 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 646 static void XLogFileClose(void);
 647 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 648                                         const char *recovername, off_t expectedSize);
 649 static void ExecuteRecoveryCommand(char *command, char *commandName,
 650                                            bool failOnerror);
 651 static void PreallocXlogFiles(XLogRecPtr endptr);
 652 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 653 static void UpdateLastRemovedPtr(char *filename);
 654 static void ValidateXLOGDirectoryStructure(void);
 655 static void CleanupBackupHistory(void);
 656 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 657 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
 658 static void CheckRecoveryConsistency(void);
 659 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 660 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 661 static List *readTimeLineHistory(TimeLineID targetTLI);
 662 static bool existsTimeLineHistory(TimeLineID probeTLI);
 663 static bool rescanLatestTimeLine(void);
 664 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 665 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 666                                          TimeLineID endTLI,
 667                                          uint32 endLogId, uint32 endLogSeg);
 668 static void WriteControlFile(void);
 669 static void ReadControlFile(void);
 670 static char *str_time(pg_time_t tnow);
 671 static bool CheckForStandbyTrigger(void);
 672
 673 #ifdef WAL_DEBUG
 674 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 675 #endif
 676 static void pg_start_backup_callback(int code, Datum arg);
 677 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 678                                   bool *backupEndRequired, bool *backupFromStandby);
 679 static void rm_redo_error_callback(void *arg);
 680 static int      get_sync_bit(int method);
 681
 682
 683 /*
 684  * Insert an XLOG record having the specified RMID and info bytes,
 685  * with the body of the record being the data chunk(s) described by
 686  * the rdata chain (see xlog.h for notes about rdata).
 687  *
 688  * Returns XLOG pointer to end of record (beginning of next record).
 689  * This can be used as LSN for data pages affected by the logged action.
 690  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 691  * before the data page can be written out.  This implements the basic
 692  * WAL rule "write the log before the data".)
 693  *
 694  * NB: this routine feels free to scribble on the XLogRecData structs,
 695  * though not on the data they reference.  This is OK since the XLogRecData
 696  * structs are always just temporaries in the calling code.
      *
      * Parameters:
      *   rmid  - resource manager ID, stored in the record header (xl_rmid).
      *   info  - info byte for the record.  None of the XLR_INFO_MASK bits may
      *           be set by the caller (PANIC otherwise); the backup-block flags
      *           are OR'ed into the value here.
      *   rdata - head of the XLogRecData chain.  It is dereferenced without a
      *           NULL check, and except for XLOG_SWITCH records the chain must
      *           yield a nonzero xl_len (PANIC otherwise; see NOTE in the body).
      *
      * Special cases:
      *   - ERROR is raised if XLogInsertAllowed() returns false.
      *   - In bootstrap mode, records for any rmgr other than RM_XLOG_ID are
      *     not written; a phony pointer (0/SizeOfXLogLongPHD) is returned.
      *   - An XLOG_SWITCH requested exactly at the start of a segment is not
      *     inserted; the end address of the prior segment is returned, after
      *     flushing WAL that far if it was not flushed already.
      *
      * Side effects: when a record is actually inserted, ProcLastRecPtr is set
      * to the record's start and XactLastRecEnd to the returned end pointer.
 697  */
 698 XLogRecPtr
 699 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 700 {
 701         XLogCtlInsert *Insert = &XLogCtl->Insert;
 702         XLogRecord *record;
 703         XLogContRecord *contrecord;
 704         XLogRecPtr      RecPtr;
 705         XLogRecPtr      WriteRqst;
 706         uint32          freespace;
 707         int                     curridx;
 708         XLogRecData *rdt;
 709         XLogRecData *rdt_lastnormal;
 710         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 711         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 712         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 713         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 714         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 715         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 716         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 717         pg_crc32        rdata_crc;
 718         uint32          len,
 719                                 write_len;
 720         unsigned        i;
 721         bool            updrqst;
 722         bool            doPageWrites;
 723         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 724         uint8           info_orig = info;
 725
 726         /* cross-check on whether we should be here or not */
 727         if (!XLogInsertAllowed())
 728                 elog(ERROR, "cannot make new WAL entries during recovery");
 729
 730         /* info's high bits are reserved for use by me */
 731         if (info & XLR_INFO_MASK)
 732                 elog(PANIC, "invalid xlog info mask %02X", info);
 733
 734         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 735
 736         /*
 737          * In bootstrap mode, we don't actually log anything but XLOG resources;
 738          * return a phony record pointer.
 739          */
 740         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 741         {
 742                 RecPtr.xlogid = 0;
 743                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 744                 return RecPtr;
 745         }
 746
 747         /*
 748          * Here we scan the rdata chain, to determine which buffers must be backed
 749          * up.
 750          *
 751          * We may have to loop back to here if a race condition is detected below.
 752          * We could prevent the race by doing all this work while holding the
 753          * insert lock, but it seems better to avoid doing CRC calculations while
 754          * holding the lock.
 755          *
 756          * We add entries for backup blocks to the chain, so that they don't need
 757          * any special treatment in the critical section where the chunks are
 758          * copied into the WAL buffers. Those entries have to be unlinked from the
 759          * chain if we have to loop back here.
              *
              * Chain entries that reference a buffer chosen for backup get their
              * data/len cleared, so their payload is left out of the record and is
              * not counted in len.
 760          */
 761 begin:;
 762         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 763         {
 764                 dtbuf[i] = InvalidBuffer;
 765                 dtbuf_bkp[i] = false;
 766         }
 767
 768         /*
 769          * Decide if we need to do full-page writes in this XLOG record: true if
 770          * full_page_writes is on or we have a PITR request for it.  Since we
 771          * don't yet have the insert lock, fullPageWrites and forcePageWrites
 772          * could change under us, but we'll recheck them once we have the lock.
 773          */
 774         doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 775
 776         len = 0;
 777         for (rdt = rdata;;)
 778         {
 779                 if (rdt->buffer == InvalidBuffer)
 780                 {
 781                         /* Simple data, just include it */
 782                         len += rdt->len;
 783                 }
 784                 else
 785                 {
 786                         /* Find info for buffer */
 787                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 788                         {
 789                                 if (rdt->buffer == dtbuf[i])
 790                                 {
 791                                         /* Buffer already referenced by earlier chain item */
 792                                         if (dtbuf_bkp[i])
 793                                         {
 794                                                 rdt->data = NULL;
 795                                                 rdt->len = 0;
 796                                         }
 797                                         else if (rdt->data)
 798                                                 len += rdt->len;
 799                                         break;
 800                                 }
 801                                 if (dtbuf[i] == InvalidBuffer)
 802                                 {
 803                                         /* OK, put it in this slot */
 804                                         dtbuf[i] = rdt->buffer;
 805                                         if (XLogCheckBuffer(rdt, doPageWrites,
 806                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 807                                         {
 808                                                 dtbuf_bkp[i] = true;
 809                                                 rdt->data = NULL;
 810                                                 rdt->len = 0;
 811                                         }
 812                                         else if (rdt->data)
 813                                                 len += rdt->len;
 814                                         break;
 815                                 }
 816                         }
 817                         if (i >= XLR_MAX_BKP_BLOCKS)
 818                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 819                                          XLR_MAX_BKP_BLOCKS);
 820                 }
 821                 /* Break out of loop when rdt points to last chain item */
 822                 if (rdt->next == NULL)
 823                         break;
 824                 rdt = rdt->next;
 825         }
 826
 827         /*
 828          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 829          * error checking in ReadRecord.  This means that all callers of
 830          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 831          * make an exception for XLOG SWITCH records because we don't want them to
 832          * ever cross a segment boundary.
 833          */
 834         if (len == 0 && !isLogSwitch)
 835                 elog(PANIC, "invalid xlog record length %u", len);
 836
 837         /*
 838          * Make additional rdata chain entries for the backup blocks, so that we
 839          * don't need to special-case them in the write loop.  This modifies the
 840          * original rdata chain, but we keep a pointer to the last regular entry,
 841          * rdt_lastnormal, so that we can undo this if we have to loop back to the
 842          * beginning.
 843          *
 844          * At the exit of this loop, write_len includes the backup block data.
 845          *
 846          * Also set the appropriate info bits to show which buffers were backed
 847          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 848          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
              *
              * Each backed-up buffer contributes a BkpBlock header entry (dtbuf_rdt1)
              * followed by the page contents: one entry (dtbuf_rdt2) covering all
              * BLCKSZ bytes when hole_length is zero, otherwise two entries
              * (dtbuf_rdt2 and dtbuf_rdt3) covering the bytes before and after the
              * hole.
 849          */
 850         rdt_lastnormal = rdt;
 851         write_len = len;
 852         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 853         {
 854                 BkpBlock   *bkpb;
 855                 char       *page;
 856
 857                 if (!dtbuf_bkp[i])
 858                         continue;
 859
 860                 info |= XLR_SET_BKP_BLOCK(i);
 861
 862                 bkpb = &(dtbuf_xlg[i]);
 863                 page = (char *) BufferGetBlock(dtbuf[i]);
 864
 865                 rdt->next = &(dtbuf_rdt1[i]);
 866                 rdt = rdt->next;
 867
 868                 rdt->data = (char *) bkpb;
 869                 rdt->len = sizeof(BkpBlock);
 870                 write_len += sizeof(BkpBlock);
 871
 872                 rdt->next = &(dtbuf_rdt2[i]);
 873                 rdt = rdt->next;
 874
 875                 if (bkpb->hole_length == 0)
 876                 {
 877                         rdt->data = page;
 878                         rdt->len = BLCKSZ;
 879                         write_len += BLCKSZ;
 880                         rdt->next = NULL;
 881                 }
 882                 else
 883                 {
 884                         /* must skip the hole */
 885                         rdt->data = page;
 886                         rdt->len = bkpb->hole_offset;
 887                         write_len += bkpb->hole_offset;
 888
 889                         rdt->next = &(dtbuf_rdt3[i]);
 890                         rdt = rdt->next;
 891
 892                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 893                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 894                         write_len += rdt->len;
 895                         rdt->next = NULL;
 896                 }
 897         }
 898
 899         /*
 900          * Calculate CRC of the data, including all the backup blocks
 901          *
 902          * Note that the record header isn't added into the CRC initially since we
 903          * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
 904          * the whole record in the order: rdata, then backup blocks, then record
 905          * header.
 906          */
 907         INIT_CRC32(rdata_crc);
 908         for (rdt = rdata; rdt != NULL; rdt = rdt->next)
 909                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 910
 911         START_CRIT_SECTION();
 912
 913         /* Now wait to get insert lock */
 914         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 915
 916         /*
 917          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 918          * back and recompute everything.  This can only happen just after a
 919          * checkpoint, so it's better to be slow in this case and fast otherwise.
 920          *
 921          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 922          * affect the contents of the XLOG record, so we'll update our local copy
 923          * but not force a recomputation.
 924          */
 925         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 926         {
 927                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 928                 RedoRecPtr = Insert->RedoRecPtr;
 929
 930                 if (doPageWrites)
 931                 {
 932                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 933                         {
 934                                 if (dtbuf[i] == InvalidBuffer)
 935                                         continue;
 936                                 if (dtbuf_bkp[i] == false &&
 937                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 938                                 {
 939                                         /*
 940                                          * Oops, this buffer now needs to be backed up, but we
 941                                          * didn't think so above.  Start over.
                                              *
                                              * Before looping back: drop the lock, leave the
                                              * critical section, unlink the backup-block entries
                                              * that were appended after rdt_lastnormal, and
                                              * restore the caller's info bits.
 942                                          */
 943                                         LWLockRelease(WALInsertLock);
 944                                         END_CRIT_SECTION();
 945                                         rdt_lastnormal->next = NULL;
 946                                         info = info_orig;
 947                                         goto begin;
 948                                 }
 949                         }
 950                 }
 951         }
 952
 953         /*
 954          * Also check to see if fullPageWrites or forcePageWrites was just turned
 955          * on; if we weren't already doing full-page writes then go back and
 956          * recompute. (If it was just turned off, we could recompute the record
 957          * without full pages, but we choose not to bother.)
 958          */
 959         if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
 960         {
 961                 /* Oops, must redo it with full-page data. */
 962                 LWLockRelease(WALInsertLock);
 963                 END_CRIT_SECTION();
 964                 rdt_lastnormal->next = NULL;
 965                 info = info_orig;
 966                 goto begin;
 967         }
 968
 969         /*
 970          * If there isn't enough space on the current XLOG page for a record
 971          * header, advance to the next page (leaving the unused space as zeroes).
 972          */
 973         updrqst = false;
 974         freespace = INSERT_FREESPACE(Insert);
 975         if (freespace < SizeOfXLogRecord)
 976         {
 977                 updrqst = AdvanceXLInsertBuffer(false);
 978                 freespace = INSERT_FREESPACE(Insert);
 979         }
 980
 981         /* Compute record's XLOG location */
 982         curridx = Insert->curridx;
 983         INSERT_RECPTR(RecPtr, Insert, curridx);
 984
 985         /*
 986          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 987          * segment, we need not insert it (and don't want to because we'd like
 988          * consecutive switch requests to be no-ops).  Instead, make sure
 989          * everything is written and flushed through the end of the prior segment,
 990          * and return the prior segment's end address.
 991          */
 992         if (isLogSwitch &&
 993                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 994         {
 995                 /* We can release insert lock immediately */
 996                 LWLockRelease(WALInsertLock);
 997
 998                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 999                 if (RecPtr.xrecoff == 0)
1000                 {
1001                         /* crossing a logid boundary */
1002                         RecPtr.xlogid -= 1;
1003                         RecPtr.xrecoff = XLogFileSize;
1004                 }
1005
1006                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1007                 LogwrtResult = XLogCtl->LogwrtResult;
1008                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
1009                 {
1010                         XLogwrtRqst FlushRqst;
1011
1012                         FlushRqst.Write = RecPtr;
1013                         FlushRqst.Flush = RecPtr;
1014                         XLogWrite(FlushRqst, false, false);
1015                 }
1016                 LWLockRelease(WALWriteLock);
1017
1018                 END_CRIT_SECTION();
1019
1020                 return RecPtr;
1021         }
1022
1023         /* Insert record header */
1024
1025         record = (XLogRecord *) Insert->currpos;
1026         record->xl_prev = Insert->PrevRecord;
1027         record->xl_xid = GetCurrentTransactionIdIfAny();
1028         record->xl_tot_len = SizeOfXLogRecord + write_len;
1029         record->xl_len = len;           /* doesn't include backup blocks */
1030         record->xl_info = info;
1031         record->xl_rmid = rmid;
1032
1033         /* Now we can finish computing the record's CRC */
1034         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
1035                            SizeOfXLogRecord - sizeof(pg_crc32));
1036         FIN_CRC32(rdata_crc);
1037         record->xl_crc = rdata_crc;
1038
1039 #ifdef WAL_DEBUG
1040         if (XLOG_DEBUG)
1041         {
1042                 StringInfoData buf;
1043
1044                 initStringInfo(&buf);
1045                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1046                                                  RecPtr.xlogid, RecPtr.xrecoff);
1047                 xlog_outrec(&buf, record);
1048                 if (rdata->data != NULL)
1049                 {
1050                         appendStringInfo(&buf, " - ");
1051                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
1052                 }
1053                 elog(LOG, "%s", buf.data);
1054                 pfree(buf.data);
1055         }
1056 #endif
1057
1058         /* Record begin of record in appropriate places */
1059         ProcLastRecPtr = RecPtr;
1060         Insert->PrevRecord = RecPtr;
1061
1062         Insert->currpos += SizeOfXLogRecord;
1063         freespace -= SizeOfXLogRecord;
1064
1065         /*
1066          * Append the data, including backup blocks if any
              *
              * write_len counts the bytes still to be copied; the rdata pointer
              * itself is advanced (and its entries' data/len adjusted) as we go.
1067          */
1068         while (write_len)
1069         {
                     /* skip chain entries that carry no data */
1070                 while (rdata->data == NULL)
1071                         rdata = rdata->next;
1072
1073                 if (freespace > 0)
1074                 {
                             /*
                              * If the chunk doesn't fit, copy what fits and fall through to
                              * the page advance below; otherwise copy it all and move on to
                              * the next chain entry.
                              */
1075                         if (rdata->len > freespace)
1076                         {
1077                                 memcpy(Insert->currpos, rdata->data, freespace);
1078                                 rdata->data += freespace;
1079                                 rdata->len -= freespace;
1080                                 write_len -= freespace;
1081                         }
1082                         else
1083                         {
1084                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1085                                 freespace -= rdata->len;
1086                                 write_len -= rdata->len;
1087                                 Insert->currpos += rdata->len;
1088                                 rdata = rdata->next;
1089                                 continue;
1090                         }
1091                 }
1092
1093                 /* Use next buffer */
1094                 updrqst = AdvanceXLInsertBuffer(false);
1095                 curridx = Insert->curridx;
1096                 /* Insert cont-record header */
1097                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1098                 contrecord = (XLogContRecord *) Insert->currpos;
                     /* xl_rem_len = bytes of this record not yet copied */
1099                 contrecord->xl_rem_len = write_len;
1100                 Insert->currpos += SizeOfXLogContRecord;
1101                 freespace = INSERT_FREESPACE(Insert);
1102         }
1103
1104         /* Ensure next record will be properly aligned */
1105         Insert->currpos = (char *) Insert->currpage +
1106                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1107         freespace = INSERT_FREESPACE(Insert);
1108
1109         /*
1110          * The recptr I return is the beginning of the *next* record. This will be
1111          * stored as LSN for changed data pages...
1112          */
1113         INSERT_RECPTR(RecPtr, Insert, curridx);
1114
1115         /*
1116          * If the record is an XLOG_SWITCH, we must now write and flush all the
1117          * existing data, and then forcibly advance to the start of the next
1118          * segment.  It's not good to do this I/O while holding the insert lock,
1119          * but there seems too much risk of confusion if we try to release the
1120          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1121          * operation anyway...
1122          */
1123         if (isLogSwitch)
1124         {
1125                 XLogwrtRqst FlushRqst;
1126                 XLogRecPtr      OldSegEnd;
1127
1128                 TRACE_POSTGRESQL_XLOG_SWITCH();
1129
1130                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1131
1132                 /*
1133                  * Flush through the end of the page containing XLOG_SWITCH, and
1134                  * perform end-of-segment actions (eg, notifying archiver).
1135                  */
1136                 WriteRqst = XLogCtl->xlblocks[curridx];
1137                 FlushRqst.Write = WriteRqst;
1138                 FlushRqst.Flush = WriteRqst;
1139                 XLogWrite(FlushRqst, false, true);
1140
1141                 /* Set up the next buffer as first page of next segment */
1142                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1143                 (void) AdvanceXLInsertBuffer(true);
1144
1145                 /* There should be no unwritten data */
1146                 curridx = Insert->curridx;
1147                 Assert(curridx == XLogCtl->Write.curridx);
1148
1149                 /* Compute end address of old segment */
1150                 OldSegEnd = XLogCtl->xlblocks[curridx];
1151                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
1152                 if (OldSegEnd.xrecoff == 0)
1153                 {
1154                         /* crossing a logid boundary */
1155                         OldSegEnd.xlogid -= 1;
1156                         OldSegEnd.xrecoff = XLogFileSize;
1157                 }
1158
1159                 /* Make it look like we've written and synced all of old segment */
1160                 LogwrtResult.Write = OldSegEnd;
1161                 LogwrtResult.Flush = OldSegEnd;
1162
1163                 /*
1164                  * Update shared-memory status --- this code should match XLogWrite
1165                  */
1166                 {
1167                         /* use volatile pointer to prevent code rearrangement */
1168                         volatile XLogCtlData *xlogctl = XLogCtl;
1169
1170                         SpinLockAcquire(&xlogctl->info_lck);
1171                         xlogctl->LogwrtResult = LogwrtResult;
1172                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1173                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1174                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1175                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1176                         SpinLockRelease(&xlogctl->info_lck);
1177                 }
1178
1179                 LWLockRelease(WALWriteLock);
1180
1181                 updrqst = false;                /* done already */
1182         }
1183         else
1184         {
1185                 /* normal case, ie not xlog switch */
1186
1187                 /* Need to update shared LogwrtRqst if some block was filled up */
1188                 if (freespace < SizeOfXLogRecord)
1189                 {
1190                         /* curridx is filled and available for writing out */
1191                         updrqst = true;
1192                 }
1193                 else
1194                 {
1195                         /* if updrqst already set, write through end of previous buf */
1196                         curridx = PrevBufIdx(curridx);
1197                 }
1198                 WriteRqst = XLogCtl->xlblocks[curridx];
1199         }
1200
1201         LWLockRelease(WALInsertLock);
1202
             /* Publish the new write request, now that the insert lock is released */
1203         if (updrqst)
1204         {
1205                 /* use volatile pointer to prevent code rearrangement */
1206                 volatile XLogCtlData *xlogctl = XLogCtl;
1207
1208                 SpinLockAcquire(&xlogctl->info_lck);
1209                 /* advance global request to include new block(s) */
1210                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1211                         xlogctl->LogwrtRqst.Write = WriteRqst;
1212                 /* update local result copy while I have the chance */
1213                 LogwrtResult = xlogctl->LogwrtResult;
1214                 SpinLockRelease(&xlogctl->info_lck);
1215         }
1216
1217         XactLastRecEnd = RecPtr;
1218
1219         END_CRIT_SECTION();
1220
1221         return RecPtr;
1222 }
1223
1224 /*
1225  * Determine whether the buffer referenced by an XLogRecData item has to
1226  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1227  * save the buffer's LSN at *lsn.
1228  */
1229 static bool
1230 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1231                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1232 {
1233         Page            page;
1234
1235         page = BufferGetPage(rdata->buffer);
1236
1237         /*
1238          * XXX We assume page LSN is first data on *every* page that can be passed
1239          * to XLogInsert, whether it otherwise has the standard page layout or
1240          * not.
1241          */
1242         *lsn = PageGetLSN(page);
1243
1244         if (doPageWrites &&
1245                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1246         {
1247                 /*
1248                  * The page needs to be backed up, so set up *bkpb
1249                  */
1250                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1251
1252                 if (rdata->buffer_std)
1253                 {
1254                         /* Assume we can omit data between pd_lower and pd_upper */
1255                         uint16          lower = ((PageHeader) page)->pd_lower;
1256                         uint16          upper = ((PageHeader) page)->pd_upper;
1257
1258                         if (lower >= SizeOfPageHeaderData &&
1259                                 upper > lower &&
1260                                 upper <= BLCKSZ)
1261                         {
1262                                 bkpb->hole_offset = lower;
1263                                 bkpb->hole_length = upper - lower;
1264                         }
1265                         else
1266                         {
1267                                 /* No "hole" to compress out */
1268                                 bkpb->hole_offset = 0;
1269                                 bkpb->hole_length = 0;
1270                         }
1271                 }
1272                 else
1273                 {
1274                         /* Not a standard page header, don't try to eliminate "hole" */
1275                         bkpb->hole_offset = 0;
1276                         bkpb->hole_length = 0;
1277                 }
1278
1279                 return true;                    /* buffer requires backup */
1280         }
1281
1282         return false;                           /* buffer does not need to be backed up */
1283 }
1284
1285 /*
1286  * XLogArchiveNotify
1287  *
1288  * Create an archive notification file
1289  *
1290  * The name of the notification file is the message that will be picked up
1291  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1292  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1293  * then when complete, rename it to 0000000100000001000000C6.done
1294  */
1295 static void
1296 XLogArchiveNotify(const char *xlog)
1297 {
1298         char            archiveStatusPath[MAXPGPATH];
1299         FILE       *fd;
1300
1301         /* insert an otherwise empty file called <XLOG>.ready */
1302         StatusFilePath(archiveStatusPath, xlog, ".ready");
1303         fd = AllocateFile(archiveStatusPath, "w");
1304         if (fd == NULL)
1305         {
1306                 ereport(LOG,
1307                                 (errcode_for_file_access(),
1308                                  errmsg("could not create archive status file \"%s\": %m",
1309                                                 archiveStatusPath)));
1310                 return;
1311         }
1312         if (FreeFile(fd))
1313         {
1314                 ereport(LOG,
1315                                 (errcode_for_file_access(),
1316                                  errmsg("could not write archive status file \"%s\": %m",
1317                                                 archiveStatusPath)));
1318                 return;
1319         }
1320
1321         /* Notify archiver that it's got something to do */
1322         if (IsUnderPostmaster)
1323                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1324 }
1325
1326 /*
1327  * Convenience routine to notify using log/seg representation of filename
1328  */
1329 static void
1330 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1331 {
1332         char            xlog[MAXFNAMELEN];
1333
1334         XLogFileName(xlog, ThisTimeLineID, log, seg);
1335         XLogArchiveNotify(xlog);
1336 }
1337
1338 /*
1339  * XLogArchiveCheckDone
1340  *
1341  * This is called when we are ready to delete or recycle an old XLOG segment
1342  * file or backup history file.  If it is okay to delete it then return true.
1343  * If it is not time to delete it, make sure a .ready file exists, and return
1344  * false.
1345  *
1346  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1347  * then return false; else create <XLOG>.ready and return false.
1348  *
1349  * The reason we do things this way is so that if the original attempt to
1350  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1351  */
1352 static bool
1353 XLogArchiveCheckDone(const char *xlog)
1354 {
1355         char            archiveStatusPath[MAXPGPATH];
1356         struct stat stat_buf;
1357
1358         /* Always deletable if archiving is off */
1359         if (!XLogArchivingActive())
1360                 return true;
1361
1362         /* First check for .done --- this means archiver is done with it */
1363         StatusFilePath(archiveStatusPath, xlog, ".done");
1364         if (stat(archiveStatusPath, &stat_buf) == 0)
1365                 return true;
1366
1367         /* check for .ready --- this means archiver is still busy with it */
1368         StatusFilePath(archiveStatusPath, xlog, ".ready");
1369         if (stat(archiveStatusPath, &stat_buf) == 0)
1370                 return false;
1371
1372         /* Race condition --- maybe archiver just finished, so recheck */
1373         StatusFilePath(archiveStatusPath, xlog, ".done");
1374         if (stat(archiveStatusPath, &stat_buf) == 0)
1375                 return true;
1376
1377         /* Retry creation of the .ready file */
1378         XLogArchiveNotify(xlog);
1379         return false;
1380 }
1381
1382 /*
1383  * XLogArchiveIsBusy
1384  *
1385  * Check to see if an XLOG segment file is still unarchived.
1386  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1387  * the first place we aren't chartered to recreate the .ready file, and
1388  * in the second place we should consider that if the file is already gone
1389  * then it's not busy.  (This check is needed to handle the race condition
1390  * that a checkpoint already deleted the no-longer-needed file.)
1391  */
1392 static bool
1393 XLogArchiveIsBusy(const char *xlog)
1394 {
1395         char            archiveStatusPath[MAXPGPATH];
1396         struct stat stat_buf;
1397
1398         /* First check for .done --- this means archiver is done with it */
1399         StatusFilePath(archiveStatusPath, xlog, ".done");
1400         if (stat(archiveStatusPath, &stat_buf) == 0)
1401                 return false;
1402
1403         /* check for .ready --- this means archiver is still busy with it */
1404         StatusFilePath(archiveStatusPath, xlog, ".ready");
1405         if (stat(archiveStatusPath, &stat_buf) == 0)
1406                 return true;
1407
1408         /* Race condition --- maybe archiver just finished, so recheck */
1409         StatusFilePath(archiveStatusPath, xlog, ".done");
1410         if (stat(archiveStatusPath, &stat_buf) == 0)
1411                 return false;
1412
1413         /*
1414          * Check to see if the WAL file has been removed by checkpoint, which
1415          * implies it has already been archived, and explains why we can't see a
1416          * status file for it.
1417          */
1418         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1419         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1420                 errno == ENOENT)
1421                 return false;
1422
1423         return true;
1424 }
1425
1426 /*
1427  * XLogArchiveCleanup
1428  *
1429  * Cleanup archive notification file(s) for a particular xlog segment
1430  */
1431 static void
1432 XLogArchiveCleanup(const char *xlog)
1433 {
1434         char            archiveStatusPath[MAXPGPATH];
1435
1436         /* Remove the .done file */
1437         StatusFilePath(archiveStatusPath, xlog, ".done");
1438         unlink(archiveStatusPath);
1439         /* should we complain about failure? */
1440
1441         /* Remove the .ready file if present --- normally it shouldn't be */
1442         StatusFilePath(archiveStatusPath, xlog, ".ready");
1443         unlink(archiveStatusPath);
1444         /* should we complain about failure? */
1445 }
1446
1447 /*
1448  * Advance the Insert state to the next buffer page, writing out the next
1449  * buffer if it still contains unwritten data.
1450  *
1451  * If new_segment is TRUE then we set up the next buffer page as the first
1452  * page of the next xlog segment file, possibly but not usually the next
1453  * consecutive file page.
1454  *
1455  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1456  * just-filled page.  If we can do this for free (without an extra lock),
1457  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1458  * request update still needs to be done, FALSE if we did it internally.
1459  *
1460  * Must be called with WALInsertLock held.
1461  */
1462 static bool
1463 AdvanceXLInsertBuffer(bool new_segment)
1464 {
1465         XLogCtlInsert *Insert = &XLogCtl->Insert;
1466         int                     nextidx = NextBufIdx(Insert->curridx);
1467         bool            update_needed = true;
1468         XLogRecPtr      OldPageRqstPtr;
1469         XLogwrtRqst WriteRqst;
1470         XLogRecPtr      NewPageEndPtr;
1471         XLogPageHeader NewPage;
1472
1473         /*
1474          * Get ending-offset of the buffer page we need to replace (this may be
1475          * zero if the buffer hasn't been used yet).  Fall through if it's already
1476          * written out.
1477          */
1478         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1479         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1480         {
1481                 /* nope, got work to do... */
1482                 XLogRecPtr      FinishedPageRqstPtr;
1483
1484                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1485
1486                 /* Before waiting, get info_lck and update LogwrtResult */
1487                 {
1488                         /* use volatile pointer to prevent code rearrangement */
1489                         volatile XLogCtlData *xlogctl = XLogCtl;
1490
1491                         SpinLockAcquire(&xlogctl->info_lck);
1492                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1493                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1494                         LogwrtResult = xlogctl->LogwrtResult;
1495                         SpinLockRelease(&xlogctl->info_lck);
1496                 }
1497
1498                 update_needed = false;  /* Did the shared-request update */
1499
1500                 /*
1501                  * Now that we have an up-to-date LogwrtResult value, see if we still
1502                  * need to write it or if someone else already did.
1503                  */
1504                 if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1505                 {
1506                         /* Must acquire write lock */
1507                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1508                         LogwrtResult = XLogCtl->LogwrtResult;
1509                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1510                         {
1511                                 /* OK, someone wrote it already */
1512                                 LWLockRelease(WALWriteLock);
1513                         }
1514                         else
1515                         {
1516                                 /*
1517                                  * Have to write buffers while holding insert lock. This is
1518                                  * not good, so only write as much as we absolutely must.
1519                                  */
1520                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1521                                 WriteRqst.Write = OldPageRqstPtr;
1522                                 WriteRqst.Flush.xlogid = 0;
1523                                 WriteRqst.Flush.xrecoff = 0;
1524                                 XLogWrite(WriteRqst, false, false);
1525                                 LWLockRelease(WALWriteLock);
1526                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1527                         }
1528                 }
1529         }
1530
1531         /*
1532          * Now the next buffer slot is free and we can set it up to be the next
1533          * output page.
1534          */
1535         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1536
1537         if (new_segment)
1538         {
1539                 /* force it to a segment start point */
1540                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1541                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1542         }
1543
1544         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1545         {
1546                 /* crossing a logid boundary */
1547                 NewPageEndPtr.xlogid += 1;
1548                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1549         }
1550         else
1551                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1552         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1553         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1554
1555         Insert->curridx = nextidx;
1556         Insert->currpage = NewPage;
1557
1558         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1559
1560         /*
1561          * Be sure to re-zero the buffer so that bytes beyond what we've written
1562          * will look like zeroes and not valid XLOG records...
1563          */
1564         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1565
1566         /*
1567          * Fill the new page's header
1568          */
1569         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1570
1571         /* NewPage->xlp_info = 0; */    /* done by memset */
1572         NewPage   ->xlp_tli = ThisTimeLineID;
1573         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1574         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1575
1576         /*
1577          * If online backup is not in progress, mark the header to indicate that
1578          * WAL records beginning in this page have removable backup blocks.  This
1579          * allows the WAL archiver to know whether it is safe to compress archived
1580          * WAL data by transforming full-block records into the non-full-block
1581          * format.      It is sufficient to record this at the page level because we
1582          * force a page switch (in fact a segment switch) when starting a backup,
1583          * so the flag will be off before any records can be written during the
1584          * backup.      At the end of a backup, the last page will be marked as all
1585          * unsafe when perhaps only part is unsafe, but at worst the archiver
1586          * would miss the opportunity to compress a few records.
1587          */
1588         if (!Insert->forcePageWrites)
1589                 NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
1590
1591         /*
1592          * If first page of an XLOG segment file, make it a long header.
1593          */
1594         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1595         {
1596                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1597
1598                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1599                 NewLongPage->xlp_seg_size = XLogSegSize;
1600                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1601                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1602
1603                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1604         }
1605
1606         return update_needed;
1607 }
1608
1609 /*
1610  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1611  *
1612  * logid/logseg indicate a log file that has just been filled up (or read
1613  * during recovery). We measure the distance from RedoRecPtr to logid/logseg
1614  * and see if that exceeds CheckPointSegments.
1615  *
1616  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1617  */
1618 static bool
1619 XLogCheckpointNeeded(uint32 logid, uint32 logseg)
1620 {
1621         /*
1622          * A straight computation of segment number could overflow 32 bits. Rather
1623          * than assuming we have working 64-bit arithmetic, we compare the
1624          * highest-order bits separately, and force a checkpoint immediately when
1625          * they change.
1626          */
1627         uint32          old_segno,
1628                                 new_segno;
1629         uint32          old_highbits,
1630                                 new_highbits;
1631
1632         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1633                 (RedoRecPtr.xrecoff / XLogSegSize);
1634         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1635         new_segno = (logid % XLogSegSize) * XLogSegsPerFile + logseg;
1636         new_highbits = logid / XLogSegSize;
1637         if (new_highbits != old_highbits ||
1638                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1639                 return true;
1640         return false;
1641 }
1642
1643 /*
1644  * Write and/or fsync the log at least as far as WriteRqst indicates.
1645  *
1646  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1647  * may stop at any convenient boundary (such as a cache or logfile boundary).
1648  * This option allows us to avoid uselessly issuing multiple writes when a
1649  * single one would do.
1650  *
1651  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1652  * perform end-of-segment actions after writing the last page, even if
1653  * it's not physically the end of its segment.  (NB: this will work properly
1654  * only if caller specifies WriteRqst == page-end and flexible == false,
1655  * and there is some data to write.)
1656  *
1657  * Must be called with WALWriteLock held.
1658  */
1659 static void
1660 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1661 {
1662         XLogCtlWrite *Write = &XLogCtl->Write;
1663         bool            ispartialpage;
1664         bool            last_iteration;
1665         bool            finishing_seg;
1666         bool            use_existent;
1667         int                     curridx;
1668         int                     npages;
1669         int                     startidx;
1670         uint32          startoffset;
1671
1672         /* We should always be inside a critical section here */
1673         Assert(CritSectionCount > 0);
1674
1675         /*
1676          * Update local LogwrtResult (caller probably did this already, but...)
1677          */
1678         LogwrtResult = XLogCtl->LogwrtResult;
1679
1680         /*
1681          * Since successive pages in the xlog cache are consecutively allocated,
1682          * we can usually gather multiple pages together and issue just one
1683          * write() call.  npages is the number of pages we have determined can be
1684          * written together; startidx is the cache block index of the first one,
1685          * and startoffset is the file offset at which it should go. The latter
1686          * two variables are only valid when npages > 0, but we must initialize
1687          * all of them to keep the compiler quiet.
1688          */
1689         npages = 0;
1690         startidx = 0;
1691         startoffset = 0;
1692
1693         /*
1694          * Within the loop, curridx is the cache block index of the page to
1695          * consider writing.  We advance Write->curridx only after successfully
1696          * writing pages.  (Right now, this refinement is useless since we are
1697          * going to PANIC if any error occurs anyway; but someday it may come in
1698          * useful.)
1699          */
1700         curridx = Write->curridx;
1701
1702         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1703         {
1704                 /*
1705                  * Make sure we're not ahead of the insert process.  This could happen
1706                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1707                  * last page that's been initialized by AdvanceXLInsertBuffer.
1708                  */
1709                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1710                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1711                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1712                                  XLogCtl->xlblocks[curridx].xlogid,
1713                                  XLogCtl->xlblocks[curridx].xrecoff);
1714
1715                 /* Advance LogwrtResult.Write to end of current buffer page */
1716                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1717                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1718
1719                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1720                 {
1721                         /*
1722                          * Switch to new logfile segment.  We cannot have any pending
1723                          * pages here (since we dump what we have at segment end).
1724                          */
1725                         Assert(npages == 0);
1726                         if (openLogFile >= 0)
1727                                 XLogFileClose();
1728                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1729
1730                         /* create/use new log file */
1731                         use_existent = true;
1732                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1733                                                                            &use_existent, true);
1734                         openLogOff = 0;
1735                 }
1736
1737                 /* Make sure we have the current logfile open */
1738                 if (openLogFile < 0)
1739                 {
1740                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1741                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1742                         openLogOff = 0;
1743                 }
1744
1745                 /* Add current page to the set of pending pages-to-dump */
1746                 if (npages == 0)
1747                 {
1748                         /* first of group */
1749                         startidx = curridx;
1750                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1751                 }
1752                 npages++;
1753
1754                 /*
1755                  * Dump the set if this will be the last loop iteration, or if we are
1756                  * at the last page of the cache area (since the next page won't be
1757                  * contiguous in memory), or if we are at the end of the logfile
1758                  * segment.
1759                  */
1760                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1761
1762                 finishing_seg = !ispartialpage &&
1763                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1764
1765                 if (last_iteration ||
1766                         curridx == XLogCtl->XLogCacheBlck ||
1767                         finishing_seg)
1768                 {
1769                         char       *from;
1770                         Size            nbytes;
1771
1772                         /* Need to seek in the file? */
1773                         if (openLogOff != startoffset)
1774                         {
1775                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1776                                         ereport(PANIC,
1777                                                         (errcode_for_file_access(),
1778                                                          errmsg("could not seek in log file %u, "
1779                                                                         "segment %u to offset %u: %m",
1780                                                                         openLogId, openLogSeg, startoffset)));
1781                                 openLogOff = startoffset;
1782                         }
1783
1784                         /* OK to write the page(s) */
1785                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1786                         nbytes = npages * (Size) XLOG_BLCKSZ;
1787                         errno = 0;
1788                         if (write(openLogFile, from, nbytes) != nbytes)
1789                         {
1790                                 /* if write didn't set errno, assume no disk space */
1791                                 if (errno == 0)
1792                                         errno = ENOSPC;
1793                                 ereport(PANIC,
1794                                                 (errcode_for_file_access(),
1795                                                  errmsg("could not write to log file %u, segment %u "
1796                                                                 "at offset %u, length %lu: %m",
1797                                                                 openLogId, openLogSeg,
1798                                                                 openLogOff, (unsigned long) nbytes)));
1799                         }
1800
1801                         /* Update state for write */
1802                         openLogOff += nbytes;
1803                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1804                         npages = 0;
1805
1806                         /*
1807                          * If we just wrote the whole last page of a logfile segment,
1808                          * fsync the segment immediately.  This avoids having to go back
1809                          * and re-open prior segments when an fsync request comes along
1810                          * later. Doing it here ensures that one and only one backend will
1811                          * perform this fsync.
1812                          *
1813                          * We also do this if this is the last page written for an xlog
1814                          * switch.
1815                          *
1816                          * This is also the right place to notify the Archiver that the
1817                          * segment is ready to copy to archival storage, and to update the
1818                          * timer for archive_timeout, and to signal for a checkpoint if
1819                          * too many logfile segments have been used since the last
1820                          * checkpoint.
1821                          */
1822                         if (finishing_seg || (xlog_switch && last_iteration))
1823                         {
1824                                 issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1825                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1826
1827                                 if (XLogArchivingActive())
1828                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1829
1830                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1831
1832                                 /*
1833                                  * Request a checkpoint if we've consumed too much xlog since
1834                                  * the last one.  For speed, we first check using the local
1835                                  * copy of RedoRecPtr, which might be out of date; if it looks
1836                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
1837                                  * recheck.
1838                                  */
1839                                 if (IsUnderPostmaster &&
1840                                         XLogCheckpointNeeded(openLogId, openLogSeg))
1841                                 {
1842                                         (void) GetRedoRecPtr();
1843                                         if (XLogCheckpointNeeded(openLogId, openLogSeg))
1844                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1845                                 }
1846                         }
1847                 }
1848
1849                 if (ispartialpage)
1850                 {
1851                         /* Only asked to write a partial page */
1852                         LogwrtResult.Write = WriteRqst.Write;
1853                         break;
1854                 }
1855                 curridx = NextBufIdx(curridx);
1856
1857                 /* If flexible, break out of loop as soon as we wrote something */
1858                 if (flexible && npages == 0)
1859                         break;
1860         }
1861
1862         Assert(npages == 0);
1863         Assert(curridx == Write->curridx);
1864
1865         /*
1866          * If asked to flush, do so
1867          */
1868         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1869                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1870         {
1871                 /*
1872                  * Could get here without iterating above loop, in which case we might
1873                  * have no open file or the wrong one.  However, we do not need to
1874                  * fsync more than one file.
1875                  */
1876                 if (sync_method != SYNC_METHOD_OPEN &&
1877                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1878                 {
1879                         if (openLogFile >= 0 &&
1880                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1881                                 XLogFileClose();
1882                         if (openLogFile < 0)
1883                         {
1884                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1885                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1886                                 openLogOff = 0;
1887                         }
1888                         issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1889                 }
1890                 LogwrtResult.Flush = LogwrtResult.Write;
1891         }
1892
1893         /*
1894          * Update shared-memory status
1895          *
1896          * We make sure that the shared 'request' values do not fall behind the
1897          * 'result' values.  This is not absolutely essential, but it saves some
1898          * code in a couple of places.
1899          */
1900         {
1901                 /* use volatile pointer to prevent code rearrangement */
1902                 volatile XLogCtlData *xlogctl = XLogCtl;
1903
1904                 SpinLockAcquire(&xlogctl->info_lck);
1905                 xlogctl->LogwrtResult = LogwrtResult;
1906                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1907                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1908                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1909                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1910                 SpinLockRelease(&xlogctl->info_lck);
1911         }
1912 }
1913
1914 /*
1915  * Record the LSN for an asynchronous transaction commit/abort
1916  * and nudge the WALWriter if there is work for it to do.
1917  * (This should not be called for synchronous commits.)
1918  */
1919 void
1920 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1921 {
1922         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
1923         bool            sleeping;
1924
1925         /* use volatile pointer to prevent code rearrangement */
1926         volatile XLogCtlData *xlogctl = XLogCtl;
1927
1928         SpinLockAcquire(&xlogctl->info_lck);
1929         LogwrtResult = xlogctl->LogwrtResult;
1930         sleeping = xlogctl->WalWriterSleeping;
1931         if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
1932                 xlogctl->asyncXactLSN = asyncXactLSN;
1933         SpinLockRelease(&xlogctl->info_lck);
1934
1935         /*
1936          * If the WALWriter is sleeping, we should kick it to make it come out of
1937          * low-power mode.      Otherwise, determine whether there's a full page of
1938          * WAL available to write.
1939          */
1940         if (!sleeping)
1941         {
1942                 /* back off to last completed page boundary */
1943                 WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
1944
1945                 /* if we have already flushed that far, we're done */
1946                 if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1947                         return;
1948         }
1949
1950         /*
1951          * Nudge the WALWriter: it has a full page of WAL to write, or we want it
1952          * to come out of low-power mode so that this async commit will reach disk
1953          * within the expected amount of time.
1954          */
1955         if (ProcGlobal->walwriterLatch)
1956                 SetLatch(ProcGlobal->walwriterLatch);
1957 }
1958
1959 /*
1960  * Advance minRecoveryPoint in control file.
1961  *
1962  * If we crash during recovery, we must reach this point again before the
1963  * database is consistent.
1964  *
1965  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1966  * is only updated if it's not already greater than or equal to 'lsn'.
1967  */
1968 static void
1969 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1970 {
1971         /* Quick check using our local copy of the variable */
1972         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1973                 return;
1974
1975         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1976
1977         /* update local copy */
1978         minRecoveryPoint = ControlFile->minRecoveryPoint;
1979
1980         /*
1981          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1982          * i.e., we're doing crash recovery.  We never modify the control file's
1983          * value in that case, so we can short-circuit future checks here too.
1984          */
1985         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
1986                 updateMinRecoveryPoint = false;
1987         else if (force || XLByteLT(minRecoveryPoint, lsn))
1988         {
1989                 /* use volatile pointer to prevent code rearrangement */
1990                 volatile XLogCtlData *xlogctl = XLogCtl;
1991                 XLogRecPtr      newMinRecoveryPoint;
1992
1993                 /*
1994                  * To avoid having to update the control file too often, we update it
1995                  * all the way to the last record being replayed, even though 'lsn'
1996                  * would suffice for correctness.  This also allows the 'force' case
1997                  * to not need a valid 'lsn' value.
1998                  *
1999                  * Another important reason for doing it this way is that the passed
2000                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2001                  * the caller got it from a corrupted heap page.  Accepting such a
2002                  * value as the min recovery point would prevent us from coming up at
2003                  * all.  Instead, we just log a warning and continue with recovery.
2004                  * (See also the comments about corrupt LSNs in XLogFlush.)
2005                  */
2006                 SpinLockAcquire(&xlogctl->info_lck);
2007                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
2008                 SpinLockRelease(&xlogctl->info_lck);
2009
2010                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
2011                         elog(WARNING,
2012                            "xlog min recovery request %X/%X is past current point %X/%X",
2013                                  lsn.xlogid, lsn.xrecoff,
2014                                  newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
2015
2016                 /* update control file */
2017                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
2018                 {
2019                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2020                         UpdateControlFile();
2021                         minRecoveryPoint = newMinRecoveryPoint;
2022
2023                         ereport(DEBUG2,
2024                                         (errmsg("updated min recovery point to %X/%X",
2025                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
2026                 }
2027         }
2028         LWLockRelease(ControlFileLock);
2029 }
2030
2031 /*
2032  * Ensure that all XLOG data through the given position is flushed to disk.
2033  *
2034  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2035  * already held, and we try to avoid acquiring it if possible.
2036  */
2037 void
2038 XLogFlush(XLogRecPtr record)
2039 {
2040         XLogRecPtr      WriteRqstPtr;
2041         XLogwrtRqst WriteRqst;
2042
2043         /*
2044          * During REDO, we are reading not writing WAL.  Therefore, instead of
2045          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2046          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2047          * to act this way too, and because when it tries to write the
2048          * end-of-recovery checkpoint, it should indeed flush.
2049          */
2050         if (!XLogInsertAllowed())
2051         {
2052                 UpdateMinRecoveryPoint(record, false);
2053                 return;
2054         }
2055
2056         /* Quick exit if already known flushed */
2057         if (XLByteLE(record, LogwrtResult.Flush))
2058                 return;
2059
2060 #ifdef WAL_DEBUG
2061         if (XLOG_DEBUG)
2062                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2063                          record.xlogid, record.xrecoff,
2064                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2065                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2066 #endif
2067
2068         START_CRIT_SECTION();
2069
2070         /*
2071          * Since fsync is usually a horribly expensive operation, we try to
2072          * piggyback as much data as we can on each fsync: if we see any more data
2073          * entered into the xlog buffer, we'll write and fsync that too, so that
2074          * the final value of LogwrtResult.Flush is as large as possible. This
2075          * gives us some chance of avoiding another fsync immediately after.
2076          */
2077
2078         /* initialize to given target; may increase below */
2079         WriteRqstPtr = record;
2080
2081         /*
2082          * Now wait until we get the write lock, or someone else does the flush
2083          * for us.
2084          */
2085         for (;;)
2086         {
2087                 /* use volatile pointer to prevent code rearrangement */
2088                 volatile XLogCtlData *xlogctl = XLogCtl;
2089
2090                 /* read LogwrtResult and update local state */
2091                 SpinLockAcquire(&xlogctl->info_lck);
2092                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
2093                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2094                 LogwrtResult = xlogctl->LogwrtResult;
2095                 SpinLockRelease(&xlogctl->info_lck);
2096
2097                 /* done already? */
2098                 if (XLByteLE(record, LogwrtResult.Flush))
2099                         break;
2100
2101                 /*
2102                  * Try to get the write lock. If we can't get it immediately, wait
2103                  * until it's released, and recheck if we still need to do the flush
2104                  * or if the backend that held the lock did it for us already. This
2105                  * helps to maintain a good rate of group committing when the system
2106                  * is bottlenecked by the speed of fsyncing.
2107                  */
2108                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2109                 {
2110                         /*
2111                          * The lock is now free, but we didn't acquire it yet. Before we
2112                          * do, loop back to check if someone else flushed the record for
2113                          * us already.
2114                          */
2115                         continue;
2116                 }
2117                 /* Got the lock */
2118                 LogwrtResult = XLogCtl->LogwrtResult;
2119                 if (!XLByteLE(record, LogwrtResult.Flush))
2120                 {
2121                         /* try to write/flush later additions to XLOG as well */
2122                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
2123                         {
2124                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
2125                                 uint32          freespace = INSERT_FREESPACE(Insert);
2126
2127                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
2128                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2129                                 else
2130                                 {
2131                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2132                                         WriteRqstPtr.xrecoff -= freespace;
2133                                 }
2134                                 LWLockRelease(WALInsertLock);
2135                                 WriteRqst.Write = WriteRqstPtr;
2136                                 WriteRqst.Flush = WriteRqstPtr;
2137                         }
2138                         else
2139                         {
2140                                 WriteRqst.Write = WriteRqstPtr;
2141                                 WriteRqst.Flush = record;
2142                         }
2143                         XLogWrite(WriteRqst, false, false);
2144                 }
2145                 LWLockRelease(WALWriteLock);
2146                 /* done */
2147                 break;
2148         }
2149
2150         END_CRIT_SECTION();
2151
2152         /*
2153          * If we still haven't flushed to the request point then we have a
2154          * problem; most likely, the requested flush point is past end of XLOG.
2155          * This has been seen to occur when a disk page has a corrupted LSN.
2156          *
2157          * Formerly we treated this as a PANIC condition, but that hurts the
2158          * system's robustness rather than helping it: we do not want to take down
2159          * the whole system due to corruption on one data page.  In particular, if
2160          * the bad page is encountered again during recovery then we would be
2161          * unable to restart the database at all!  (This scenario actually
2162          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2163          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2164          * the only time we can reach here during recovery is while flushing the
2165          * end-of-recovery checkpoint record, and we don't expect that to have a
2166          * bad LSN.
2167          *
2168          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2169          * since xact.c calls this routine inside a critical section.  However,
2170          * calls from bufmgr.c are not within critical sections and so we will not
2171          * force a restart for a bad LSN on a data page.
2172          */
2173         if (XLByteLT(LogwrtResult.Flush, record))
2174                 elog(ERROR,
2175                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2176                          record.xlogid, record.xrecoff,
2177                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2178 }
2179
2180 /*
2181  * Flush xlog, but without specifying exactly where to flush to.
2182  *
2183  * We normally flush only completed blocks; but if there is nothing to do on
2184  * that basis, we check for unflushed async commits in the current incomplete
2185  * block, and flush through the latest one of those.  Thus, if async commits
2186  * are not being used, we will flush complete blocks only.      We can guarantee
2187  * that async commits reach disk after at most three cycles; normally only
2188  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
2189  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
2190  * difference only with very high load or long wal_writer_delay, but imposes
2191  * one extra cycle for the worst case for async commits.)
2192  *
2193  * This routine is invoked periodically by the background walwriter process.
2194  *
2195  * Returns TRUE if we flushed anything.
2196  */
2197 bool
2198 XLogBackgroundFlush(void)
2199 {
2200         XLogRecPtr      WriteRqstPtr;
2201         bool            flexible = true;
2202         bool            wrote_something = false;
2203
2204         /* XLOG doesn't need flushing during recovery */
2205         if (RecoveryInProgress())
2206                 return false;
2207
2208         /* read LogwrtResult and update local state */
2209         {
2210                 /* use volatile pointer to prevent code rearrangement */
2211                 volatile XLogCtlData *xlogctl = XLogCtl;
2212
2213                 SpinLockAcquire(&xlogctl->info_lck);
2214                 LogwrtResult = xlogctl->LogwrtResult;
2215                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2216                 SpinLockRelease(&xlogctl->info_lck);
2217         }
2218
2219         /* back off to last completed page boundary */
2220         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
2221
2222         /* if we have already flushed that far, consider async commit records */
2223         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2224         {
2225                 /* use volatile pointer to prevent code rearrangement */
2226                 volatile XLogCtlData *xlogctl = XLogCtl;
2227
2228                 SpinLockAcquire(&xlogctl->info_lck);
2229                 WriteRqstPtr = xlogctl->asyncXactLSN;
2230                 SpinLockRelease(&xlogctl->info_lck);
2231                 flexible = false;               /* ensure it all gets written */
2232         }
2233
2234         /*
2235          * If already known flushed, we're done. Just need to check if we are
2236          * holding an open file handle to a logfile that's no longer in use,
2237          * preventing the file from being deleted.
2238          */
2239         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2240         {
2241                 if (openLogFile >= 0)
2242                 {
2243                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
2244                         {
2245                                 XLogFileClose();
2246                         }
2247                 }
2248                 return false;
2249         }
2250
2251 #ifdef WAL_DEBUG
2252         if (XLOG_DEBUG)
2253                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2254                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
2255                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2256                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2257 #endif
2258
2259         START_CRIT_SECTION();
2260
2261         /* now wait for the write lock */
2262         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2263         LogwrtResult = XLogCtl->LogwrtResult;
2264         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2265         {
2266                 XLogwrtRqst WriteRqst;
2267
2268                 WriteRqst.Write = WriteRqstPtr;
2269                 WriteRqst.Flush = WriteRqstPtr;
2270                 XLogWrite(WriteRqst, flexible, false);
2271                 wrote_something = true;
2272         }
2273         LWLockRelease(WALWriteLock);
2274
2275         END_CRIT_SECTION();
2276
2277         /*
2278          * If we wrote something then we have something to send to standbys also,
2279          * otherwise the replication delay become around 7s with just async
2280          * commit.
2281          */
2282         if (wrote_something)
2283                 WalSndWakeup();
2284
2285         return wrote_something;
2286 }
2287
2288 /*
2289  * Test whether XLOG data has been flushed up to (at least) the given position.
2290  *
2291  * Returns true if a flush is still needed.  (It may be that someone else
2292  * is already in process of flushing that far, however.)
2293  */
2294 bool
2295 XLogNeedsFlush(XLogRecPtr record)
2296 {
2297         /*
2298          * During recovery, we don't flush WAL but update minRecoveryPoint
2299          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2300          * would need to be updated.
2301          */
2302         if (RecoveryInProgress())
2303         {
2304                 /* Quick exit if already known updated */
2305                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2306                         return false;
2307
2308                 /*
2309                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2310                  * just return a conservative guess.
2311                  */
2312                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2313                         return true;
2314                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2315                 LWLockRelease(ControlFileLock);
2316
2317                 /*
2318                  * An invalid minRecoveryPoint means that we need to recover all the
2319                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2320                  * file's value in that case, so we can short-circuit future checks
2321                  * here too.
2322                  */
2323                 if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
2324                         updateMinRecoveryPoint = false;
2325
2326                 /* check again */
2327                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2328                         return false;
2329                 else
2330                         return true;
2331         }
2332
2333         /* Quick exit if already known flushed */
2334         if (XLByteLE(record, LogwrtResult.Flush))
2335                 return false;
2336
2337         /* read LogwrtResult and update local state */
2338         {
2339                 /* use volatile pointer to prevent code rearrangement */
2340                 volatile XLogCtlData *xlogctl = XLogCtl;
2341
2342                 SpinLockAcquire(&xlogctl->info_lck);
2343                 LogwrtResult = xlogctl->LogwrtResult;
2344                 SpinLockRelease(&xlogctl->info_lck);
2345         }
2346
2347         /* check again */
2348         if (XLByteLE(record, LogwrtResult.Flush))
2349                 return false;
2350
2351         return true;
2352 }
2353
2354 /*
2355  * Create a new XLOG file segment, or open a pre-existing one.
2356  *
2357  * log, seg: identify segment to be created/opened.
2358  *
2359  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2360  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2361  * file was used.
2362  *
2363  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2364  * place.  This should be TRUE except during bootstrap log creation.  The
2365  * caller must *not* hold the lock at call.
2366  *
2367  * Returns FD of opened file.
2368  *
2369  * Note: errors here are ERROR not PANIC because we might or might not be
2370  * inside a critical section (eg, during checkpoint there is no reason to
2371  * take down the system on failure).  They will promote to PANIC if we are
2372  * in a critical section.
2373  */
2374 int
2375 XLogFileInit(uint32 log, uint32 seg,
2376                          bool *use_existent, bool use_lock)
2377 {
2378         char            path[MAXPGPATH];
2379         char            tmppath[MAXPGPATH];
2380         char       *zbuffer;
2381         uint32          installed_log;
2382         uint32          installed_seg;
2383         int                     max_advance;
2384         int                     fd;
2385         int                     nbytes;
2386
2387         XLogFilePath(path, ThisTimeLineID, log, seg);
2388
2389         /*
2390          * Try to use existent file (checkpoint maker may have created it already)
2391          */
2392         if (*use_existent)
2393         {
2394                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2395                                                    S_IRUSR | S_IWUSR);
2396                 if (fd < 0)
2397                 {
2398                         if (errno != ENOENT)
2399                                 ereport(ERROR,
2400                                                 (errcode_for_file_access(),
2401                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2402                                                                 path, log, seg)));
2403                 }
2404                 else
2405                         return fd;
2406         }
2407
2408         /*
2409          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2410          * another process is doing the same thing.  If so, we will end up
2411          * pre-creating an extra log segment.  That seems OK, and better than
2412          * holding the lock throughout this lengthy process.
2413          */
2414         elog(DEBUG2, "creating and filling new WAL file");
2415
2416         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2417
2418         unlink(tmppath);
2419
2420         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2421         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2422                                            S_IRUSR | S_IWUSR);
2423         if (fd < 0)
2424                 ereport(ERROR,
2425                                 (errcode_for_file_access(),
2426                                  errmsg("could not create file \"%s\": %m", tmppath)));
2427
2428         /*
2429          * Zero-fill the file.  We have to do this the hard way to ensure that all
2430          * the file space has really been allocated --- on platforms that allow
2431          * "holes" in files, just seeking to the end doesn't allocate intermediate
2432          * space.  This way, we know that we have all the space and (after the
2433          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2434          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2435          * log file.
2436          *
2437          * Note: palloc zbuffer, instead of just using a local char array, to
2438          * ensure it is reasonably well-aligned; this may save a few cycles
2439          * transferring data to the kernel.
2440          */
2441         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2442         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2443         {
2444                 errno = 0;
2445                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2446                 {
2447                         int                     save_errno = errno;
2448
2449                         /*
2450                          * If we fail to make the file, delete it to release disk space
2451                          */
2452                         unlink(tmppath);
2453                         /* if write didn't set errno, assume problem is no disk space */
2454                         errno = save_errno ? save_errno : ENOSPC;
2455
2456                         ereport(ERROR,
2457                                         (errcode_for_file_access(),
2458                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2459                 }
2460         }
2461         pfree(zbuffer);
2462
2463         if (pg_fsync(fd) != 0)
2464                 ereport(ERROR,
2465                                 (errcode_for_file_access(),
2466                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2467
2468         if (close(fd))
2469                 ereport(ERROR,
2470                                 (errcode_for_file_access(),
2471                                  errmsg("could not close file \"%s\": %m", tmppath)));
2472
2473         /*
2474          * Now move the segment into place with its final name.
2475          *
2476          * If caller didn't want to use a pre-existing file, get rid of any
2477          * pre-existing file.  Otherwise, cope with possibility that someone else
2478          * has created the file while we were filling ours: if so, use ours to
2479          * pre-create a future log segment.
2480          */
2481         installed_log = log;
2482         installed_seg = seg;
2483         max_advance = XLOGfileslop;
2484         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2485                                                                 *use_existent, &max_advance,
2486                                                                 use_lock))
2487         {
2488                 /*
2489                  * No need for any more future segments, or InstallXLogFileSegment()
2490                  * failed to rename the file into place. If the rename failed, opening
2491                  * the file below will fail.
2492                  */
2493                 unlink(tmppath);
2494         }
2495
2496         /* Set flag to tell caller there was no existent file */
2497         *use_existent = false;
2498
2499         /* Now open original target segment (might not be file I just made) */
2500         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2501                                            S_IRUSR | S_IWUSR);
2502         if (fd < 0)
2503                 ereport(ERROR,
2504                                 (errcode_for_file_access(),
2505                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2506                                   path, log, seg)));
2507
2508         elog(DEBUG2, "done creating and filling new WAL file");
2509
2510         return fd;
2511 }
2512
2513 /*
2514  * Create a new XLOG file segment by copying a pre-existing one.
2515  *
2516  * log, seg: identify segment to be created.
2517  *
2518  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2519  *              a different timeline)
2520  *
2521  * Currently this is only used during recovery, and so there are no locking
2522  * considerations.      But we should be just as tense as XLogFileInit to avoid
2523  * emplacing a bogus file.
2524  */
2525 static void
2526 XLogFileCopy(uint32 log, uint32 seg,
2527                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2528 {
2529         char            path[MAXPGPATH];
2530         char            tmppath[MAXPGPATH];
2531         char            buffer[XLOG_BLCKSZ];
2532         int                     srcfd;
2533         int                     fd;
2534         int                     nbytes;
2535
2536         /*
2537          * Open the source file
2538          */
2539         XLogFilePath(path, srcTLI, srclog, srcseg);
2540         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2541         if (srcfd < 0)
2542                 ereport(ERROR,
2543                                 (errcode_for_file_access(),
2544                                  errmsg("could not open file \"%s\": %m", path)));
2545
2546         /*
2547          * Copy into a temp file name.
2548          */
2549         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2550
2551         unlink(tmppath);
2552
2553         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2554         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2555                                            S_IRUSR | S_IWUSR);
2556         if (fd < 0)
2557                 ereport(ERROR,
2558                                 (errcode_for_file_access(),
2559                                  errmsg("could not create file \"%s\": %m", tmppath)));
2560
2561         /*
2562          * Do the data copying.
2563          */
2564         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2565         {
2566                 errno = 0;
2567                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2568                 {
2569                         if (errno != 0)
2570                                 ereport(ERROR,
2571                                                 (errcode_for_file_access(),
2572                                                  errmsg("could not read file \"%s\": %m", path)));
2573                         else
2574                                 ereport(ERROR,
2575                                                 (errmsg("not enough data in file \"%s\"", path)));
2576                 }
2577                 errno = 0;
2578                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2579                 {
2580                         int                     save_errno = errno;
2581
2582                         /*
2583                          * If we fail to make the file, delete it to release disk space
2584                          */
2585                         unlink(tmppath);
2586                         /* if write didn't set errno, assume problem is no disk space */
2587                         errno = save_errno ? save_errno : ENOSPC;
2588
2589                         ereport(ERROR,
2590                                         (errcode_for_file_access(),
2591                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2592                 }
2593         }
2594
2595         if (pg_fsync(fd) != 0)
2596                 ereport(ERROR,
2597                                 (errcode_for_file_access(),
2598                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2599
2600         if (close(fd))
2601                 ereport(ERROR,
2602                                 (errcode_for_file_access(),
2603                                  errmsg("could not close file \"%s\": %m", tmppath)));
2604
2605         close(srcfd);
2606
2607         /*
2608          * Now move the segment into place with its final name.
2609          */
2610         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2611                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2612 }
2613
2614 /*
2615  * Install a new XLOG segment file as a current or future log segment.
2616  *
2617  * This is used both to install a newly-created segment (which has a temp
2618  * filename while it's being created) and to recycle an old segment.
2619  *
2620  * *log, *seg: identify segment to install as (or first possible target).
2621  * When find_free is TRUE, these are modified on return to indicate the
2622  * actual installation location or last segment searched.
2623  *
2624  * tmppath: initial name of file to install.  It will be renamed into place.
2625  *
2626  * find_free: if TRUE, install the new segment at the first empty log/seg
2627  * number at or after the passed numbers.  If FALSE, install the new segment
2628  * exactly where specified, deleting any existing segment file there.
2629  *
2630  * *max_advance: maximum number of log/seg slots to advance past the starting
2631  * point.  Fail if no free slot is found in this range.  On return, reduced
2632  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2633  * when find_free is FALSE.)
2634  *
2635  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2636  * place.  This should be TRUE except during bootstrap log creation.  The
2637  * caller must *not* hold the lock at call.
2638  *
2639  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2640  * max_advance limit was exceeded, or an error occurred while renaming the
2641  * file into place.
2642  */
2643 static bool
2644 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2645                                            bool find_free, int *max_advance,
2646                                            bool use_lock)
2647 {
2648         char            path[MAXPGPATH];
2649         struct stat stat_buf;
2650
2651         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2652
2653         /*
2654          * We want to be sure that only one process does this at a time.
2655          */
2656         if (use_lock)
2657                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2658
2659         if (!find_free)
2660         {
2661                 /* Force installation: get rid of any pre-existing segment file */
2662                 unlink(path);
2663         }
2664         else
2665         {
2666                 /* Find a free slot to put it in */
2667                 while (stat(path, &stat_buf) == 0)
2668                 {
2669                         if (*max_advance <= 0)
2670                         {
2671                                 /* Failed to find a free slot within specified range */
2672                                 if (use_lock)
2673                                         LWLockRelease(ControlFileLock);
2674                                 return false;
2675                         }
2676                         NextLogSeg(*log, *seg);
2677                         (*max_advance)--;
2678                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2679                 }
2680         }
2681
2682         /*
2683          * Prefer link() to rename() here just to be really sure that we don't
2684          * overwrite an existing logfile.  However, there shouldn't be one, so
2685          * rename() is an acceptable substitute except for the truly paranoid.
2686          */
2687 #if HAVE_WORKING_LINK
2688         if (link(tmppath, path) < 0)
2689         {
2690                 if (use_lock)
2691                         LWLockRelease(ControlFileLock);
2692                 ereport(LOG,
2693                                 (errcode_for_file_access(),
2694                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2695                                                 tmppath, path, *log, *seg)));
2696                 return false;
2697         }
2698         unlink(tmppath);
2699 #else
2700         if (rename(tmppath, path) < 0)
2701         {
2702                 if (use_lock)
2703                         LWLockRelease(ControlFileLock);
2704                 ereport(LOG,
2705                                 (errcode_for_file_access(),
2706                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2707                                                 tmppath, path, *log, *seg)));
2708                 return false;
2709         }
2710 #endif
2711
2712         if (use_lock)
2713                 LWLockRelease(ControlFileLock);
2714
2715         return true;
2716 }
2717
2718 /*
2719  * Open a pre-existing logfile segment for writing.
2720  */
2721 int
2722 XLogFileOpen(uint32 log, uint32 seg)
2723 {
2724         char            path[MAXPGPATH];
2725         int                     fd;
2726
2727         XLogFilePath(path, ThisTimeLineID, log, seg);
2728
2729         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2730                                            S_IRUSR | S_IWUSR);
2731         if (fd < 0)
2732                 ereport(PANIC,
2733                                 (errcode_for_file_access(),
2734                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2735                                   path, log, seg)));
2736
2737         return fd;
2738 }
2739
2740 /*
2741  * Open a logfile segment for reading (during recovery).
2742  *
2743  * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2744  * Otherwise, it's assumed to be already available in pg_xlog.
2745  */
2746 static int
2747 XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
2748                          int source, bool notfoundOk)
2749 {
2750         char            xlogfname[MAXFNAMELEN];
2751         char            activitymsg[MAXFNAMELEN + 16];
2752         char            path[MAXPGPATH];
2753         int                     fd;
2754
2755         XLogFileName(xlogfname, tli, log, seg);
2756
2757         switch (source)
2758         {
2759                 case XLOG_FROM_ARCHIVE:
2760                         /* Report recovery progress in PS display */
2761                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2762                                          xlogfname);
2763                         set_ps_display(activitymsg, false);
2764
2765                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2766                                                                                                           "RECOVERYXLOG",
2767                                                                                                           XLogSegSize);
2768                         if (!restoredFromArchive)
2769                                 return -1;
2770                         break;
2771
2772                 case XLOG_FROM_PG_XLOG:
2773                 case XLOG_FROM_STREAM:
2774                         XLogFilePath(path, tli, log, seg);
2775                         restoredFromArchive = false;
2776                         break;
2777
2778                 default:
2779                         elog(ERROR, "invalid XLogFileRead source %d", source);
2780         }
2781
2782         /*
2783          * If the segment was fetched from archival storage, replace the existing
2784          * xlog segment (if any) with the archival version.
2785          */
2786         if (source == XLOG_FROM_ARCHIVE)
2787         {
2788                 /* use volatile pointer to prevent code rearrangement */
2789                 volatile XLogCtlData *xlogctl = XLogCtl;
2790                 XLogRecPtr      endptr;
2791                 char            xlogfpath[MAXPGPATH];
2792                 bool            reload = false;
2793                 struct stat statbuf;
2794
2795                 XLogFilePath(xlogfpath, tli, log, seg);
2796                 if (stat(xlogfpath, &statbuf) == 0)
2797                 {
2798                         if (unlink(xlogfpath) != 0)
2799                                 ereport(FATAL,
2800                                                 (errcode_for_file_access(),
2801                                                  errmsg("could not remove file \"%s\": %m",
2802                                                                 xlogfpath)));
2803                         reload = true;
2804                 }
2805
2806                 if (rename(path, xlogfpath) < 0)
2807                         ereport(ERROR,
2808                                         (errcode_for_file_access(),
2809                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
2810                                                         path, xlogfpath)));
2811
2812                 /*
2813                  * If the existing segment was replaced, since walsenders might have
2814                  * it open, request them to reload a currently-open segment.
2815                  */
2816                 if (reload)
2817                         WalSndRqstFileReload();
2818
2819                 /*
2820                  * Calculate the end location of the restored WAL file and save it in
2821                  * shmem. It's used as current standby flush position, and cascading
2822                  * walsenders try to send WAL records up to this location.
2823                  */
2824                 endptr.xlogid = log;
2825                 endptr.xrecoff = seg * XLogSegSize;
2826                 XLByteAdvance(endptr, XLogSegSize);
2827
2828                 SpinLockAcquire(&xlogctl->info_lck);
2829                 xlogctl->restoreLastRecPtr = endptr;
2830                 SpinLockRelease(&xlogctl->info_lck);
2831
2832                 /* Signal walsender that new WAL has arrived */
2833                 if (AllowCascadeReplication())
2834                         WalSndWakeup();
2835         }
2836
2837         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2838         if (fd >= 0)
2839         {
2840                 /* Success! */
2841                 curFileTLI = tli;
2842
2843                 /* Report recovery progress in PS display */
2844                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2845                                  xlogfname);
2846                 set_ps_display(activitymsg, false);
2847
2848                 /* Track source of data in assorted state variables */
2849                 readSource = source;
2850                 XLogReceiptSource = source;
2851                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2852                 if (source != XLOG_FROM_STREAM)
2853                         XLogReceiptTime = GetCurrentTimestamp();
2854
2855                 return fd;
2856         }
2857         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2858                 ereport(PANIC,
2859                                 (errcode_for_file_access(),
2860                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2861                                   path, log, seg)));
2862         return -1;
2863 }
2864
2865 /*
2866  * Open a logfile segment for reading (during recovery).
2867  *
2868  * This version searches for the segment with any TLI listed in expectedTLIs.
2869  */
2870 static int
2871 XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
2872 {
2873         char            path[MAXPGPATH];
2874         ListCell   *cell;
2875         int                     fd;
2876
2877         /*
2878          * Loop looking for a suitable timeline ID: we might need to read any of
2879          * the timelines listed in expectedTLIs.
2880          *
2881          * We expect curFileTLI on entry to be the TLI of the preceding file in
2882          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2883          * to go backwards; this prevents us from picking up the wrong file when a
2884          * parent timeline extends to higher segment numbers than the child we
2885          * want to read.
2886          */
2887         foreach(cell, expectedTLIs)
2888         {
2889                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2890
2891                 if (tli < curFileTLI)
2892                         break;                          /* don't bother looking at too-old TLIs */
2893
2894                 if (sources & XLOG_FROM_ARCHIVE)
2895                 {
2896                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_ARCHIVE, true);
2897                         if (fd != -1)
2898                         {
2899                                 elog(DEBUG1, "got WAL segment from archive");
2900                                 return fd;
2901                         }
2902                 }
2903
2904                 if (sources & XLOG_FROM_PG_XLOG)
2905                 {
2906                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
2907                         if (fd != -1)
2908                                 return fd;
2909                 }
2910         }
2911
2912         /* Couldn't find it.  For simplicity, complain about front timeline */
2913         XLogFilePath(path, recoveryTargetTLI, log, seg);
2914         errno = ENOENT;
2915         ereport(emode,
2916                         (errcode_for_file_access(),
2917                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2918                                   path, log, seg)));
2919         return -1;
2920 }
2921
2922 /*
2923  * Close the current logfile segment for writing.
2924  */
2925 static void
2926 XLogFileClose(void)
2927 {
2928         Assert(openLogFile >= 0);
2929
2930         /*
2931          * WAL segment files will not be re-read in normal operation, so we advise
2932          * the OS to release any cached pages.  But do not do so if WAL archiving
2933          * or streaming is active, because archiver and walsender process could
2934          * use the cache to read the WAL segment.
2935          */
2936 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2937         if (!XLogIsNeeded())
2938                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2939 #endif
2940
2941         if (close(openLogFile))
2942                 ereport(PANIC,
2943                                 (errcode_for_file_access(),
2944                                  errmsg("could not close log file %u, segment %u: %m",
2945                                                 openLogId, openLogSeg)));
2946         openLogFile = -1;
2947 }
2948
2949 /*
2950  * Attempt to retrieve the specified file from off-line archival storage.
2951  * If successful, fill "path" with its complete path (note that this will be
2952  * a temp file name that doesn't follow the normal naming convention), and
2953  * return TRUE.
2954  *
2955  * If not successful, fill "path" with the name of the normal on-line file
2956  * (which may or may not actually exist, but we'll try to use it), and return
2957  * FALSE.
2958  *
2959  * For fixed-size files, the caller may pass the expected size as an
2960  * additional crosscheck on successful recovery.  If the file size is not
2961  * known, set expectedSize = 0.
2962  */
2963 static bool
2964 RestoreArchivedFile(char *path, const char *xlogfname,
2965                                         const char *recovername, off_t expectedSize)
2966 {
2967         char            xlogpath[MAXPGPATH];
2968         char            xlogRestoreCmd[MAXPGPATH];
2969         char            lastRestartPointFname[MAXPGPATH];
2970         char       *dp;
2971         char       *endp;
2972         const char *sp;
2973         int                     rc;
2974         bool            signaled;
2975         struct stat stat_buf;
2976         uint32          restartLog;
2977         uint32          restartSeg;
2978
2979         /* In standby mode, restore_command might not be supplied */
2980         if (recoveryRestoreCommand == NULL)
2981                 goto not_available;
2982
2983         /*
2984          * When doing archive recovery, we always prefer an archived log file even
2985          * if a file of the same name exists in XLOGDIR.  The reason is that the
2986          * file in XLOGDIR could be an old, un-filled or partly-filled version
2987          * that was copied and restored as part of backing up $PGDATA.
2988          *
2989          * We could try to optimize this slightly by checking the local copy
2990          * lastchange timestamp against the archived copy, but we have no API to
2991          * do this, nor can we guarantee that the lastchange timestamp was
2992          * preserved correctly when we copied to archive. Our aim is robustness,
2993          * so we elect not to do this.
2994          *
2995          * If we cannot obtain the log file from the archive, however, we will try
2996          * to use the XLOGDIR file if it exists.  This is so that we can make use
2997          * of log segments that weren't yet transferred to the archive.
2998          *
2999          * Notice that we don't actually overwrite any files when we copy back
3000          * from archive because the recoveryRestoreCommand may inadvertently
3001          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
3002          * fallback to the segments remaining in current XLOGDIR later. The
3003          * copy-from-archive filename is always the same, ensuring that we don't
3004          * run out of disk space on long recoveries.
3005          */
3006         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
3007
3008         /*
3009          * Make sure there is no existing file named recovername.
3010          */
3011         if (stat(xlogpath, &stat_buf) != 0)
3012         {
3013                 if (errno != ENOENT)
3014                         ereport(FATAL,
3015                                         (errcode_for_file_access(),
3016                                          errmsg("could not stat file \"%s\": %m",
3017                                                         xlogpath)));
3018         }
3019         else
3020         {
3021                 if (unlink(xlogpath) != 0)
3022                         ereport(FATAL,
3023                                         (errcode_for_file_access(),
3024                                          errmsg("could not remove file \"%s\": %m",
3025                                                         xlogpath)));
3026         }
3027
3028         /*
3029          * Calculate the archive file cutoff point for use during log shipping
3030          * replication. All files earlier than this point can be deleted from the
3031          * archive, though there is no requirement to do so.
3032          *
3033          * We initialise this with the filename of an InvalidXLogRecPtr, which
3034          * will prevent the deletion of any WAL files from the archive because of
3035          * the alphabetic sorting property of WAL filenames.
3036          *
3037          * Once we have successfully located the redo pointer of the checkpoint
3038          * from which we start recovery we never request a file prior to the redo
3039          * pointer of the last restartpoint. When redo begins we know that we have
3040          * successfully located it, so there is no need for additional status
3041          * flags to signify the point when we can begin deleting WAL files from
3042          * the archive.
3043          */
3044         if (InRedo)
3045         {
3046                 XLByteToSeg(ControlFile->checkPointCopy.redo,
3047                                         restartLog, restartSeg);
3048                 XLogFileName(lastRestartPointFname,
3049                                          ControlFile->checkPointCopy.ThisTimeLineID,
3050                                          restartLog, restartSeg);
3051                 /* we shouldn't need anything earlier than last restart point */
3052                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
3053         }
3054         else
3055                 XLogFileName(lastRestartPointFname, 0, 0, 0);
3056
3057         /*
3058          * construct the command to be executed
3059          */
3060         dp = xlogRestoreCmd;
3061         endp = xlogRestoreCmd + MAXPGPATH - 1;
3062         *endp = '\0';
3063
3064         for (sp = recoveryRestoreCommand; *sp; sp++)
3065         {
3066                 if (*sp == '%')
3067                 {
3068                         switch (sp[1])
3069                         {
3070                                 case 'p':
3071                                         /* %p: relative path of target file */
3072                                         sp++;
3073                                         StrNCpy(dp, xlogpath, endp - dp);
3074                                         make_native_path(dp);
3075                                         dp += strlen(dp);
3076                                         break;
3077                                 case 'f':
3078                                         /* %f: filename of desired file */
3079                                         sp++;
3080                                         StrNCpy(dp, xlogfname, endp - dp);
3081                                         dp += strlen(dp);
3082                                         break;
3083                                 case 'r':
3084                                         /* %r: filename of last restartpoint */
3085                                         sp++;
3086                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
3087                                         dp += strlen(dp);
3088                                         break;
3089                                 case '%':
3090                                         /* convert %% to a single % */
3091                                         sp++;
3092                                         if (dp < endp)
3093                                                 *dp++ = *sp;
3094                                         break;
3095                                 default:
3096                                         /* otherwise treat the % as not special */
3097                                         if (dp < endp)
3098                                                 *dp++ = *sp;
3099                                         break;
3100                         }
3101                 }
3102                 else
3103                 {
3104                         if (dp < endp)
3105                                 *dp++ = *sp;
3106                 }
3107         }
3108         *dp = '\0';
3109
3110         ereport(DEBUG3,
3111                         (errmsg_internal("executing restore command \"%s\"",
3112                                                          xlogRestoreCmd)));
3113
3114         /*
3115          * Check signals before restore command and reset afterwards.
3116          */
3117         PreRestoreCommand();
3118
3119         /*
3120          * Copy xlog from archival storage to XLOGDIR
3121          */
3122         rc = system(xlogRestoreCmd);
3123
3124         PostRestoreCommand();
3125
3126         if (rc == 0)
3127         {
3128                 /*
3129                  * command apparently succeeded, but let's make sure the file is
3130                  * really there now and has the correct size.
3131                  */
3132                 if (stat(xlogpath, &stat_buf) == 0)
3133                 {
3134                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
3135                         {
3136                                 int                     elevel;
3137
3138                                 /*
3139                                  * If we find a partial file in standby mode, we assume it's
3140                                  * because it's just being copied to the archive, and keep
3141                                  * trying.
3142                                  *
3143                                  * Otherwise treat a wrong-sized file as FATAL to ensure the
3144                                  * DBA would notice it, but is that too strong? We could try
3145                                  * to plow ahead with a local copy of the file ... but the
3146                                  * problem is that there probably isn't one, and we'd
3147                                  * incorrectly conclude we've reached the end of WAL and we're
3148                                  * done recovering ...
3149                                  */
3150                                 if (StandbyMode && stat_buf.st_size < expectedSize)
3151                                         elevel = DEBUG1;
3152                                 else
3153                                         elevel = FATAL;
3154                                 ereport(elevel,
3155                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
3156                                                                 xlogfname,
3157                                                                 (unsigned long) stat_buf.st_size,
3158                                                                 (unsigned long) expectedSize)));
3159                                 return false;
3160                         }
3161                         else
3162                         {
3163                                 ereport(LOG,
3164                                                 (errmsg("restored log file \"%s\" from archive",
3165                                                                 xlogfname)));
3166                                 strcpy(path, xlogpath);
3167                                 return true;
3168                         }
3169                 }
3170                 else
3171                 {
3172                         /* stat failed */
3173                         if (errno != ENOENT)
3174                                 ereport(FATAL,
3175                                                 (errcode_for_file_access(),
3176                                                  errmsg("could not stat file \"%s\": %m",
3177                                                                 xlogpath)));
3178                 }
3179         }
3180
3181         /*
3182          * Remember, we rollforward UNTIL the restore fails so failure here is
3183          * just part of the process... that makes it difficult to determine
3184          * whether the restore failed because there isn't an archive to restore,
3185          * or because the administrator has specified the restore program
3186          * incorrectly.  We have to assume the former.
3187          *
3188          * However, if the failure was due to any sort of signal, it's best to
3189          * punt and abort recovery.  (If we "return false" here, upper levels will
3190          * assume that recovery is complete and start up the database!) It's
3191          * essential to abort on child SIGINT and SIGQUIT, because per spec
3192          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3193          * those it's a good bet we should have gotten it too.
3194          *
3195          * On SIGTERM, assume we have received a fast shutdown request, and exit
3196          * cleanly. It's pure chance whether we receive the SIGTERM first, or the
3197          * child process. If we receive it first, the signal handler will call
3198          * proc_exit, otherwise we do it here. If we or the child process received
3199          * SIGTERM for any other reason than a fast shutdown request, postmaster
3200          * will perform an immediate shutdown when it sees us exiting
3201          * unexpectedly.
3202          *
3203          * Per the Single Unix Spec, shells report exit status > 128 when a called
3204          * command died on a signal.  Also, 126 and 127 are used to report
3205          * problems such as an unfindable command; treat those as fatal errors
3206          * too.
3207          */
3208         if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3209                 proc_exit(1);
3210
3211         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3212
3213         ereport(signaled ? FATAL : DEBUG2,
3214                 (errmsg("could not restore file \"%s\" from archive: return code %d",
3215                                 xlogfname, rc)));
3216
3217 not_available:
3218
3219         /*
3220          * if an archived file is not available, there might still be a version of
3221          * this file in XLOGDIR, so return that as the filename to open.
3222          *
3223          * In many recovery scenarios we expect this to fail also, but if so that
3224          * just means we've reached the end of WAL.
3225          */
3226         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3227         return false;
3228 }
3229
3230 /*
3231  * Attempt to execute an external shell command during recovery.
3232  *
3233  * 'command' is the shell command to be executed, 'commandName' is a
3234  * human-readable name describing the command emitted in the logs. If
3235  * 'failOnSignal' is true and the command is killed by a signal, a FATAL
3236  * error is thrown. Otherwise a WARNING is emitted.
3237  *
3238  * This is currently used for recovery_end_command and archive_cleanup_command.
3239  */
3240 static void
3241 ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
3242 {
3243         char            xlogRecoveryCmd[MAXPGPATH];
3244         char            lastRestartPointFname[MAXPGPATH];
3245         char       *dp;
3246         char       *endp;
3247         const char *sp;
3248         int                     rc;
3249         bool            signaled;
3250         uint32          restartLog;
3251         uint32          restartSeg;
3252
3253         Assert(command && commandName);
3254
3255         /*
3256          * Calculate the archive file cutoff point for use during log shipping
3257          * replication. All files earlier than this point can be deleted from the
3258          * archive, though there is no requirement to do so.
3259          */
3260         LWLockAcquire(ControlFileLock, LW_SHARED);
3261         XLByteToSeg(ControlFile->checkPointCopy.redo,
3262                                 restartLog, restartSeg);
3263         XLogFileName(lastRestartPointFname,
3264                                  ControlFile->checkPointCopy.ThisTimeLineID,
3265                                  restartLog, restartSeg);
3266         LWLockRelease(ControlFileLock);
3267
3268         /*
3269          * construct the command to be executed
3270          */
3271         dp = xlogRecoveryCmd;
3272         endp = xlogRecoveryCmd + MAXPGPATH - 1;
3273         *endp = '\0';
3274
3275         for (sp = command; *sp; sp++)
3276         {
3277                 if (*sp == '%')
3278                 {
3279                         switch (sp[1])
3280                         {
3281                                 case 'r':
3282                                         /* %r: filename of last restartpoint */
3283                                         sp++;
3284                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
3285                                         dp += strlen(dp);
3286                                         break;
3287                                 case '%':
3288                                         /* convert %% to a single % */
3289                                         sp++;
3290                                         if (dp < endp)
3291                                                 *dp++ = *sp;
3292                                         break;
3293                                 default:
3294                                         /* otherwise treat the % as not special */
3295                                         if (dp < endp)
3296                                                 *dp++ = *sp;
3297                                         break;
3298                         }
3299                 }
3300                 else
3301                 {
3302                         if (dp < endp)
3303                                 *dp++ = *sp;
3304                 }
3305         }
3306         *dp = '\0';
3307
3308         ereport(DEBUG3,
3309                         (errmsg_internal("executing %s \"%s\"", commandName, command)));
3310
3311         /*
3312          * execute the constructed command
3313          */
3314         rc = system(xlogRecoveryCmd);
3315         if (rc != 0)
3316         {
3317                 /*
3318                  * If the failure was due to any sort of signal, it's best to punt and
3319                  * abort recovery. See also detailed comments on signals in
3320                  * RestoreArchivedFile().
3321                  */
3322                 signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3323
3324                 ereport((signaled && failOnSignal) ? FATAL : WARNING,
3325                 /*------
3326                    translator: First %s represents a recovery.conf parameter name like
3327                   "recovery_end_command", and the 2nd is the value of that parameter. */
3328                                 (errmsg("%s \"%s\": return code %d", commandName,
3329                                                 command, rc)));
3330         }
3331 }
3332
3333 /*
3334  * Preallocate log files beyond the specified log endpoint.
3335  *
3336  * XXX this is currently extremely conservative, since it forces only one
3337  * future log segment to exist, and even that only if we are 75% done with
3338  * the current one.  This is only appropriate for very low-WAL-volume systems.
3339  * High-volume systems will be OK once they've built up a sufficient set of
3340  * recycled log segments, but the startup transient is likely to include
3341  * a lot of segment creations by foreground processes, which is not so good.
3342  */
3343 static void
3344 PreallocXlogFiles(XLogRecPtr endptr)
3345 {
3346         uint32          _logId;
3347         uint32          _logSeg;
3348         int                     lf;
3349         bool            use_existent;
3350
3351         XLByteToPrevSeg(endptr, _logId, _logSeg);
3352         if ((endptr.xrecoff - 1) % XLogSegSize >=
3353                 (uint32) (0.75 * XLogSegSize))
3354         {
3355                 NextLogSeg(_logId, _logSeg);
3356                 use_existent = true;
3357                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
3358                 close(lf);
3359                 if (!use_existent)
3360                         CheckpointStats.ckpt_segs_added++;
3361         }
3362 }
3363
3364 /*
3365  * Get the log/seg of the latest removed or recycled WAL segment.
3366  * Returns 0/0 if no WAL segments have been removed since startup.
3367  */
3368 void
3369 XLogGetLastRemoved(uint32 *log, uint32 *seg)
3370 {
3371         /* use volatile pointer to prevent code rearrangement */
3372         volatile XLogCtlData *xlogctl = XLogCtl;
3373
3374         SpinLockAcquire(&xlogctl->info_lck);
3375         *log = xlogctl->lastRemovedLog;
3376         *seg = xlogctl->lastRemovedSeg;
3377         SpinLockRelease(&xlogctl->info_lck);
3378 }
3379
3380 /*
3381  * Update the last removed log/seg pointer in shared memory, to reflect
3382  * that the given XLOG file has been removed.
3383  */
3384 static void
3385 UpdateLastRemovedPtr(char *filename)
3386 {
3387         /* use volatile pointer to prevent code rearrangement */
3388         volatile XLogCtlData *xlogctl = XLogCtl;
3389         uint32          tli,
3390                                 log,
3391                                 seg;
3392
3393         XLogFromFileName(filename, &tli, &log, &seg);
3394
3395         SpinLockAcquire(&xlogctl->info_lck);
3396         if (log > xlogctl->lastRemovedLog ||
3397                 (log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
3398         {
3399                 xlogctl->lastRemovedLog = log;
3400                 xlogctl->lastRemovedSeg = seg;
3401         }
3402         SpinLockRelease(&xlogctl->info_lck);
3403 }
3404
3405 /*
3406  * Recycle or remove all log files older or equal to passed log/seg#
3407  *
3408  * endptr is current (or recent) end of xlog; this is used to determine
3409  * whether we want to recycle rather than delete no-longer-wanted log files.
3410  */
3411 static void
3412 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
3413 {
3414         uint32          endlogId;
3415         uint32          endlogSeg;
3416         int                     max_advance;
3417         DIR                *xldir;
3418         struct dirent *xlde;
3419         char            lastoff[MAXFNAMELEN];
3420         char            path[MAXPGPATH];
3421
3422 #ifdef WIN32
3423         char            newpath[MAXPGPATH];
3424 #endif
3425         struct stat statbuf;
3426
3427         /*
3428          * Initialize info about where to try to recycle to.  We allow recycling
3429          * segments up to XLOGfileslop segments beyond the current XLOG location.
3430          */
3431         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3432         max_advance = XLOGfileslop;
3433
3434         xldir = AllocateDir(XLOGDIR);
3435         if (xldir == NULL)
3436                 ereport(ERROR,
3437                                 (errcode_for_file_access(),
3438                                  errmsg("could not open transaction log directory \"%s\": %m",
3439                                                 XLOGDIR)));
3440
3441         XLogFileName(lastoff, ThisTimeLineID, log, seg);
3442
3443         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3444                  lastoff);
3445
3446         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3447         {
3448                 /*
3449                  * We ignore the timeline part of the XLOG segment identifiers in
3450                  * deciding whether a segment is still needed.  This ensures that we
3451                  * won't prematurely remove a segment from a parent timeline. We could
3452                  * probably be a little more proactive about removing segments of
3453                  * non-parent timelines, but that would be a whole lot more
3454                  * complicated.
3455                  *
3456                  * We use the alphanumeric sorting property of the filenames to decide
3457                  * which ones are earlier than the lastoff segment.
3458                  */
3459                 if (strlen(xlde->d_name) == 24 &&
3460                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3461                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3462                 {
3463                         if (RecoveryInProgress() || XLogArchiveCheckDone(xlde->d_name))
3464                         {
3465                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3466
3467                                 /* Update the last removed location in shared memory first */
3468                                 UpdateLastRemovedPtr(xlde->d_name);
3469
3470                                 /*
3471                                  * Before deleting the file, see if it can be recycled as a
3472                                  * future log segment. Only recycle normal files, pg_standby
3473                                  * for example can create symbolic links pointing to a
3474                                  * separate archive directory.
3475                                  */
3476                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3477                                         InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3478                                                                                    true, &max_advance, true))
3479                                 {
3480                                         ereport(DEBUG2,
3481                                                         (errmsg("recycled transaction log file \"%s\"",
3482                                                                         xlde->d_name)));
3483                                         CheckpointStats.ckpt_segs_recycled++;
3484                                         /* Needn't recheck that slot on future iterations */
3485                                         if (max_advance > 0)
3486                                         {
3487                                                 NextLogSeg(endlogId, endlogSeg);
3488                                                 max_advance--;
3489                                         }
3490                                 }
3491                                 else
3492                                 {
3493                                         /* No need for any more future segments... */
3494                                         int                     rc;
3495
3496                                         ereport(DEBUG2,
3497                                                         (errmsg("removing transaction log file \"%s\"",
3498                                                                         xlde->d_name)));
3499
3500 #ifdef WIN32
3501
3502                                         /*
3503                                          * On Windows, if another process (e.g another backend)
3504                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3505                                          * will succeed, but the file will still show up in
3506                                          * directory listing until the last handle is closed. To
3507                                          * avoid confusing the lingering deleted file for a live
3508                                          * WAL file that needs to be archived, rename it before
3509                                          * deleting it.
3510                                          *
3511                                          * If another process holds the file open without
3512                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3513                                          * again at the next checkpoint.
3514                                          */
3515                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3516                                         if (rename(path, newpath) != 0)
3517                                         {
3518                                                 ereport(LOG,
3519                                                                 (errcode_for_file_access(),
3520                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3521                                                                                 path)));
3522                                                 continue;
3523                                         }
3524                                         rc = unlink(newpath);
3525 #else
3526                                         rc = unlink(path);
3527 #endif
3528                                         if (rc != 0)
3529                                         {
3530                                                 ereport(LOG,
3531                                                                 (errcode_for_file_access(),
3532                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3533                                                                                 path)));
3534                                                 continue;
3535                                         }
3536                                         CheckpointStats.ckpt_segs_removed++;
3537                                 }
3538
3539                                 XLogArchiveCleanup(xlde->d_name);
3540                         }
3541                 }
3542         }
3543
3544         FreeDir(xldir);
3545 }
3546
3547 /*
3548  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3549  * If the latter does not exist, recreate it.
3550  *
3551  * It is not the goal of this function to verify the contents of these
3552  * directories, but to help in cases where someone has performed a cluster
3553  * copy for PITR purposes but omitted pg_xlog from the copy.
3554  *
3555  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3556  * policy decision was made not to.  It is fairly common for pg_xlog to be
3557  * a symlink, and if that was the DBA's intent then automatically making a
3558  * plain directory would result in degraded performance with no notice.
3559  */
3560 static void
3561 ValidateXLOGDirectoryStructure(void)
3562 {
3563         char            path[MAXPGPATH];
3564         struct stat stat_buf;
3565
3566         /* Check for pg_xlog; if it doesn't exist, error out */
3567         if (stat(XLOGDIR, &stat_buf) != 0 ||
3568                 !S_ISDIR(stat_buf.st_mode))
3569                 ereport(FATAL,
3570                                 (errmsg("required WAL directory \"%s\" does not exist",
3571                                                 XLOGDIR)));
3572
3573         /* Check for archive_status */
3574         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3575         if (stat(path, &stat_buf) == 0)
3576         {
3577                 /* Check for weird cases where it exists but isn't a directory */
3578                 if (!S_ISDIR(stat_buf.st_mode))
3579                         ereport(FATAL,
3580                                         (errmsg("required WAL directory \"%s\" does not exist",
3581                                                         path)));
3582         }
3583         else
3584         {
3585                 ereport(LOG,
3586                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3587                 if (mkdir(path, S_IRWXU) < 0)
3588                         ereport(FATAL,
3589                                         (errmsg("could not create missing directory \"%s\": %m",
3590                                                         path)));
3591         }
3592 }
3593
3594 /*
3595  * Remove previous backup history files.  This also retries creation of
3596  * .ready files for any backup history files for which XLogArchiveNotify
3597  * failed earlier.
3598  */
3599 static void
3600 CleanupBackupHistory(void)
3601 {
3602         DIR                *xldir;
3603         struct dirent *xlde;
3604         char            path[MAXPGPATH];
3605
3606         xldir = AllocateDir(XLOGDIR);
3607         if (xldir == NULL)
3608                 ereport(ERROR,
3609                                 (errcode_for_file_access(),
3610                                  errmsg("could not open transaction log directory \"%s\": %m",
3611                                                 XLOGDIR)));
3612
3613         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3614         {
3615                 if (strlen(xlde->d_name) > 24 &&
3616                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3617                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3618                                    ".backup") == 0)
3619                 {
3620                         if (XLogArchiveCheckDone(xlde->d_name))
3621                         {
3622                                 ereport(DEBUG2,
3623                                 (errmsg("removing transaction log backup history file \"%s\"",
3624                                                 xlde->d_name)));
3625                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3626                                 unlink(path);
3627                                 XLogArchiveCleanup(xlde->d_name);
3628                         }
3629                 }
3630         }
3631
3632         FreeDir(xldir);
3633 }
3634
3635 /*
3636  * Restore the backup blocks present in an XLOG record, if any.
3637  *
3638  * We assume all of the record has been read into memory at *record.
3639  *
3640  * Note: when a backup block is available in XLOG, we restore it
3641  * unconditionally, even if the page in the database appears newer.
3642  * This is to protect ourselves against database pages that were partially
3643  * or incorrectly written during a crash.  We assume that the XLOG data
3644  * must be good because it has passed a CRC check, while the database
3645  * page might not be.  This will force us to replay all subsequent
3646  * modifications of the page that appear in XLOG, rather than possibly
3647  * ignoring them as already applied, but that's not a huge drawback.
3648  *
3649  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3650  * Otherwise, a normal exclusive lock is used.  During crash recovery, that's
3651  * just pro forma because there can't be any regular backends in the system,
3652  * but in hot standby mode the distinction is important. The 'cleanup'
3653  * argument applies to all backup blocks in the WAL record, that suffices for
3654  * now.
3655  */
3656 void
3657 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3658 {
3659         Buffer          buffer;
3660         Page            page;
3661         BkpBlock        bkpb;
3662         char       *blk;
3663         int                     i;
3664
3665         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
3666                 return;
3667
3668         blk = (char *) XLogRecGetData(record) + record->xl_len;
3669         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3670         {
3671                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3672                         continue;
3673
3674                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3675                 blk += sizeof(BkpBlock);
3676
3677                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3678                                                                                 RBM_ZERO);
3679                 Assert(BufferIsValid(buffer));
3680                 if (cleanup)
3681                         LockBufferForCleanup(buffer);
3682                 else
3683                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3684
3685                 page = (Page) BufferGetPage(buffer);
3686
3687                 if (bkpb.hole_length == 0)
3688                 {
3689                         memcpy((char *) page, blk, BLCKSZ);
3690                 }
3691                 else
3692                 {
3693                         memcpy((char *) page, blk, bkpb.hole_offset);
3694                         /* must zero-fill the hole */
3695                         MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
3696                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3697                                    blk + bkpb.hole_offset,
3698                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3699                 }
3700
3701                 PageSetLSN(page, lsn);
3702                 PageSetTLI(page, ThisTimeLineID);
3703                 MarkBufferDirty(buffer);
3704                 UnlockReleaseBuffer(buffer);
3705
3706                 blk += BLCKSZ - bkpb.hole_length;
3707         }
3708 }
3709
3710 /*
3711  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3712  * record (other than to the minimal extent of computing the amount of
3713  * data to read in) until we've checked the CRCs.
3714  *
3715  * We assume all of the record has been read into memory at *record.
3716  */
3717 static bool
3718 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3719 {
3720         pg_crc32        crc;
3721         int                     i;
3722         uint32          len = record->xl_len;
3723         BkpBlock        bkpb;
3724         char       *blk;
3725
3726         /* First the rmgr data */
3727         INIT_CRC32(crc);
3728         COMP_CRC32(crc, XLogRecGetData(record), len);
3729
3730         /* Add in the backup blocks, if any */
3731         blk = (char *) XLogRecGetData(record) + len;
3732         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3733         {
3734                 uint32          blen;
3735
3736                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3737                         continue;
3738
3739                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3740                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3741                 {
3742                         ereport(emode_for_corrupt_record(emode, recptr),
3743                                         (errmsg("incorrect hole size in record at %X/%X",
3744                                                         recptr.xlogid, recptr.xrecoff)));
3745                         return false;
3746                 }
3747                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3748                 COMP_CRC32(crc, blk, blen);
3749                 blk += blen;
3750         }
3751
3752         /* Check that xl_tot_len agrees with our calculation */
3753         if (blk != (char *) record + record->xl_tot_len)
3754         {
3755                 ereport(emode_for_corrupt_record(emode, recptr),
3756                                 (errmsg("incorrect total length in record at %X/%X",
3757                                                 recptr.xlogid, recptr.xrecoff)));
3758                 return false;
3759         }
3760
3761         /* Finally include the record header */
3762         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3763                            SizeOfXLogRecord - sizeof(pg_crc32));
3764         FIN_CRC32(crc);
3765
3766         if (!EQ_CRC32(record->xl_crc, crc))
3767         {
3768                 ereport(emode_for_corrupt_record(emode, recptr),
3769                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3770                                 recptr.xlogid, recptr.xrecoff)));
3771                 return false;
3772         }
3773
3774         return true;
3775 }
3776
3777 /*
3778  * Attempt to read an XLOG record.
3779  *
3780  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3781  * try to read a record just after the last one previously read.
3782  *
3783  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3784  * (emode must be either PANIC, LOG)
3785  *
3786  * The record is copied into readRecordBuf, so that on successful return,
3787  * the returned record pointer always points there.
3788  */
3789 static XLogRecord *
3790 ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3791 {
3792         XLogRecord *record;
3793         char       *buffer;
3794         XLogRecPtr      tmpRecPtr = EndRecPtr;
3795         bool            randAccess = false;
3796         uint32          len,
3797                                 total_len;
3798         uint32          targetRecOff;
3799         uint32          pageHeaderSize;
3800
3801         if (readBuf == NULL)
3802         {
3803                 /*
3804                  * First time through, permanently allocate readBuf.  We do it this
3805                  * way, rather than just making a static array, for two reasons: (1)
3806                  * no need to waste the storage in most instantiations of the backend;
3807                  * (2) a static char array isn't guaranteed to have any particular
3808                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3809                  */
3810                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3811                 Assert(readBuf != NULL);
3812         }
3813
3814         if (RecPtr == NULL)
3815         {
3816                 RecPtr = &tmpRecPtr;
3817
3818                 /*
3819                  * RecPtr is pointing to end+1 of the previous WAL record.      We must
3820                  * advance it if necessary to where the next record starts.  First,
3821                  * align to next page if no more records can fit on the current page.
3822                  */
3823                 if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
3824                         NextLogPage(*RecPtr);
3825
3826                 /* Check for crossing of xlog segment boundary */
3827                 if (RecPtr->xrecoff >= XLogFileSize)
3828                 {
3829                         (RecPtr->xlogid)++;
3830                         RecPtr->xrecoff = 0;
3831                 }
3832
3833                 /*
3834                  * If at page start, we must skip over the page header.  But we can't
3835                  * do that until we've read in the page, since the header size is
3836                  * variable.
3837                  */
3838         }
3839         else
3840         {
3841                 /*
3842                  * In this case, the passed-in record pointer should already be
3843                  * pointing to a valid record starting position.
3844                  */
3845                 if (!XRecOffIsValid(RecPtr->xrecoff))
3846                         ereport(PANIC,
3847                                         (errmsg("invalid record offset at %X/%X",
3848                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3849
3850                 /*
3851                  * Since we are going to a random position in WAL, forget any prior
3852                  * state about what timeline we were in, and allow it to be any
3853                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3854                  * to go backwards (but we can't reset that variable right here, since
3855                  * we might not change files at all).
3856                  */
3857                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3858                 randAccess = true;              /* allow curFileTLI to go backwards too */
3859         }
3860
3861         /* This is the first try to read this page. */
3862         failedSources = 0;
3863 retry:
3864         /* Read the page containing the record */
3865         if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
3866                 return NULL;
3867
3868         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3869         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3870         if (targetRecOff == 0)
3871         {
3872                 /*
3873                  * At page start, so skip over page header.  The Assert checks that
3874                  * we're not scribbling on caller's record pointer; it's OK because we
3875                  * can only get here in the continuing-from-prev-record case, since
3876                  * XRecOffIsValid rejected the zero-page-offset case otherwise.
3877                  */
3878                 Assert(RecPtr == &tmpRecPtr);
3879                 RecPtr->xrecoff += pageHeaderSize;
3880                 targetRecOff = pageHeaderSize;
3881         }
3882         else if (targetRecOff < pageHeaderSize)
3883         {
3884                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3885                                 (errmsg("invalid record offset at %X/%X",
3886                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3887                 goto next_record_is_invalid;
3888         }
3889         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3890                 targetRecOff == pageHeaderSize)
3891         {
3892                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3893                                 (errmsg("contrecord is requested by %X/%X",
3894                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3895                 goto next_record_is_invalid;
3896         }
3897         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3898
3899         /*
3900          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3901          * required.
3902          */
3903         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3904         {
3905                 if (record->xl_len != 0)
3906                 {
3907                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3908                                         (errmsg("invalid xlog switch record at %X/%X",
3909                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3910                         goto next_record_is_invalid;
3911                 }
3912         }
3913         else if (record->xl_len == 0)
3914         {
3915                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3916                                 (errmsg("record with zero length at %X/%X",
3917                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3918                 goto next_record_is_invalid;
3919         }
3920         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3921                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3922                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3923         {
3924                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3925                                 (errmsg("invalid record length at %X/%X",
3926                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3927                 goto next_record_is_invalid;
3928         }
3929         if (record->xl_rmid > RM_MAX_ID)
3930         {
3931                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3932                                 (errmsg("invalid resource manager ID %u at %X/%X",
3933                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3934                 goto next_record_is_invalid;
3935         }
3936         if (randAccess)
3937         {
3938                 /*
3939                  * We can't exactly verify the prev-link, but surely it should be less
3940                  * than the record's own address.
3941                  */
3942                 if (!XLByteLT(record->xl_prev, *RecPtr))
3943                 {
3944                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3945                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3946                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3947                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3948                         goto next_record_is_invalid;
3949                 }
3950         }
3951         else
3952         {
3953                 /*
3954                  * Record's prev-link should exactly match our previous location. This
3955                  * check guards against torn WAL pages where a stale but valid-looking
3956                  * WAL record starts on a sector boundary.
3957                  */
3958                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3959                 {
3960                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3961                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3962                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3963                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3964                         goto next_record_is_invalid;
3965                 }
3966         }
3967
3968         /*
3969          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3970          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3971          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3972          * enough for all "normal" records, but very large commit or abort records
3973          * might need more space.)
3974          */
3975         total_len = record->xl_tot_len;
3976         if (total_len > readRecordBufSize)
3977         {
3978                 uint32          newSize = total_len;
3979
3980                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3981                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3982                 if (readRecordBuf)
3983                         free(readRecordBuf);
3984                 readRecordBuf = (char *) malloc(newSize);
3985                 if (!readRecordBuf)
3986                 {
3987                         readRecordBufSize = 0;
3988                         /* We treat this as a "bogus data" condition */
3989                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3990                                         (errmsg("record length %u at %X/%X too long",
3991                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3992                         goto next_record_is_invalid;
3993                 }
3994                 readRecordBufSize = newSize;
3995         }
3996
3997         buffer = readRecordBuf;
3998         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3999         if (total_len > len)
4000         {
4001                 /* Need to reassemble record */
4002                 XLogContRecord *contrecord;
4003                 XLogRecPtr      pagelsn;
4004                 uint32          gotlen = len;
4005
4006                 /* Initialize pagelsn to the beginning of the page this record is on */
4007                 pagelsn = *RecPtr;
4008                 pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;
4009
4010                 memcpy(buffer, record, len);
4011                 record = (XLogRecord *) buffer;
4012                 buffer += len;
4013                 for (;;)
4014                 {
4015                         /* Calculate pointer to beginning of next page */
4016                         pagelsn.xrecoff += XLOG_BLCKSZ;
4017                         if (pagelsn.xrecoff >= XLogFileSize)
4018                         {
4019                                 (pagelsn.xlogid)++;
4020                                 pagelsn.xrecoff = 0;
4021                         }
4022                         /* Wait for the next page to become available */
4023                         if (!XLogPageRead(&pagelsn, emode, false, false))
4024                                 return NULL;
4025
4026                         /* Check that the continuation record looks valid */
4027                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
4028                         {
4029                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
4030                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
4031                                                                 readId, readSeg, readOff)));
4032                                 goto next_record_is_invalid;
4033                         }
4034                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
4035                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
4036                         if (contrecord->xl_rem_len == 0 ||
4037                                 total_len != (contrecord->xl_rem_len + gotlen))
4038                         {
4039                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
4040                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
4041                                                                 contrecord->xl_rem_len,
4042                                                                 readId, readSeg, readOff)));
4043                                 goto next_record_is_invalid;
4044                         }
4045                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
4046                         if (contrecord->xl_rem_len > len)
4047                         {
4048                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
4049                                 gotlen += len;
4050                                 buffer += len;
4051                                 continue;
4052                         }
4053                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
4054                                    contrecord->xl_rem_len);
4055                         break;
4056                 }
4057                 if (!RecordIsValid(record, *RecPtr, emode))
4058                         goto next_record_is_invalid;
4059                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
4060                 EndRecPtr.xlogid = readId;
4061                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
4062                         pageHeaderSize +
4063                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
4064
4065                 ReadRecPtr = *RecPtr;
4066                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
4067                 return record;
4068         }
4069
4070         /* Record does not cross a page boundary */
4071         if (!RecordIsValid(record, *RecPtr, emode))
4072                 goto next_record_is_invalid;
4073         EndRecPtr.xlogid = RecPtr->xlogid;
4074         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
4075
4076         ReadRecPtr = *RecPtr;
4077         memcpy(buffer, record, total_len);
4078
4079         /*
4080          * Special processing if it's an XLOG SWITCH record
4081          */
4082         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
4083         {
4084                 /* Pretend it extends to end of segment */
4085                 EndRecPtr.xrecoff += XLogSegSize - 1;
4086                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
4087
4088                 /*
4089                  * Pretend that readBuf contains the last page of the segment. This is
4090                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
4091                  * segment.
4092                  */
4093                 readOff = XLogSegSize - XLOG_BLCKSZ;
4094         }
4095         return (XLogRecord *) buffer;
4096
4097 next_record_is_invalid:
4098         failedSources |= readSource;
4099
4100         if (readFile >= 0)
4101         {
4102                 close(readFile);
4103                 readFile = -1;
4104         }
4105
4106         /* In standby-mode, keep trying */
4107         if (StandbyMode)
4108                 goto retry;
4109         else
4110                 return NULL;
4111 }
4112
4113 /*
4114  * Check whether the xlog header of a page just read in looks valid.
4115  *
4116  * This is just a convenience subroutine to avoid duplicated code in
4117  * ReadRecord.  It's not intended for use from anywhere else.
4118  */
4119 static bool
4120 ValidXLOGHeader(XLogPageHeader hdr, int emode)
4121 {
4122         XLogRecPtr      recaddr;
4123
4124         recaddr.xlogid = readId;
4125         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
4126
4127         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
4128         {
4129                 ereport(emode_for_corrupt_record(emode, recaddr),
4130                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
4131                                                 hdr->xlp_magic, readId, readSeg, readOff)));
4132                 return false;
4133         }
4134         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
4135         {
4136                 ereport(emode_for_corrupt_record(emode, recaddr),
4137                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
4138                                                 hdr->xlp_info, readId, readSeg, readOff)));
4139                 return false;
4140         }
4141         if (hdr->xlp_info & XLP_LONG_HEADER)
4142         {
4143                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
4144
4145                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
4146                 {
4147                         char            fhdrident_str[32];
4148                         char            sysident_str[32];
4149
4150                         /*
4151                          * Format sysids separately to keep platform-dependent format code
4152                          * out of the translatable message string.
4153                          */
4154                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
4155                                          longhdr->xlp_sysid);
4156                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
4157                                          ControlFile->system_identifier);
4158                         ereport(emode_for_corrupt_record(emode, recaddr),
4159                                         (errmsg("WAL file is from different database system"),
4160                                          errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
4161                                                            fhdrident_str, sysident_str)));
4162                         return false;
4163                 }
4164                 if (longhdr->xlp_seg_size != XLogSegSize)
4165                 {
4166                         ereport(emode_for_corrupt_record(emode, recaddr),
4167                                         (errmsg("WAL file is from different database system"),
4168                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
4169                         return false;
4170                 }
4171                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
4172                 {
4173                         ereport(emode_for_corrupt_record(emode, recaddr),
4174                                         (errmsg("WAL file is from different database system"),
4175                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
4176                         return false;
4177                 }
4178         }
4179         else if (readOff == 0)
4180         {
4181                 /* hmm, first page of file doesn't have a long header? */
4182                 ereport(emode_for_corrupt_record(emode, recaddr),
4183                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
4184                                                 hdr->xlp_info, readId, readSeg, readOff)));
4185                 return false;
4186         }
4187
4188         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
4189         {
4190                 ereport(emode_for_corrupt_record(emode, recaddr),
4191                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
4192                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
4193                                                 readId, readSeg, readOff)));
4194                 return false;
4195         }
4196
4197         /*
4198          * Check page TLI is one of the expected values.
4199          */
4200         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
4201         {
4202                 ereport(emode_for_corrupt_record(emode, recaddr),
4203                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
4204                                                 hdr->xlp_tli,
4205                                                 readId, readSeg, readOff)));
4206                 return false;
4207         }
4208
4209         /*
4210          * Since child timelines are always assigned a TLI greater than their
4211          * immediate parent's TLI, we should never see TLI go backwards across
4212          * successive pages of a consistent WAL sequence.
4213          *
4214          * Of course this check should only be applied when advancing sequentially
4215          * across pages; therefore ReadRecord resets lastPageTLI to zero when
4216          * going to a random page.
4217          */
4218         if (hdr->xlp_tli < lastPageTLI)
4219         {
4220                 ereport(emode_for_corrupt_record(emode, recaddr),
4221                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
4222                                                 hdr->xlp_tli, lastPageTLI,
4223                                                 readId, readSeg, readOff)));
4224                 return false;
4225         }
4226         lastPageTLI = hdr->xlp_tli;
4227         return true;
4228 }
4229
4230 /*
4231  * Try to read a timeline's history file.
4232  *
4233  * If successful, return the list of component TLIs (the given TLI followed by
4234  * its ancestor TLIs).  If we can't find the history file, assume that the
4235  * timeline has no parents, and return a list of just the specified timeline
4236  * ID.
4237  */
4238 static List *
4239 readTimeLineHistory(TimeLineID targetTLI)
4240 {
4241         List       *result;
4242         char            path[MAXPGPATH];
4243         char            histfname[MAXFNAMELEN];
4244         char            fline[MAXPGPATH];
4245         FILE       *fd;
4246
4247         /* Timeline 1 does not have a history file, so no need to check */
4248         if (targetTLI == 1)
4249                 return list_make1_int((int) targetTLI);
4250
4251         if (InArchiveRecovery)
4252         {
4253                 TLHistoryFileName(histfname, targetTLI);
4254                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4255         }
4256         else
4257                 TLHistoryFilePath(path, targetTLI);
4258
4259         fd = AllocateFile(path, "r");
4260         if (fd == NULL)
4261         {
4262                 if (errno != ENOENT)
4263                         ereport(FATAL,
4264                                         (errcode_for_file_access(),
4265                                          errmsg("could not open file \"%s\": %m", path)));
4266                 /* Not there, so assume no parents */
4267                 return list_make1_int((int) targetTLI);
4268         }
4269
4270         result = NIL;
4271
4272         /*
4273          * Parse the file...
4274          */
4275         while (fgets(fline, sizeof(fline), fd) != NULL)
4276         {
4277                 /* skip leading whitespace and check for # comment */
4278                 char       *ptr;
4279                 char       *endptr;
4280                 TimeLineID      tli;
4281
4282                 for (ptr = fline; *ptr; ptr++)
4283                 {
4284                         if (!isspace((unsigned char) *ptr))
4285                                 break;
4286                 }
4287                 if (*ptr == '\0' || *ptr == '#')
4288                         continue;
4289
4290                 /* expect a numeric timeline ID as first field of line */
4291                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
4292                 if (endptr == ptr)
4293                         ereport(FATAL,
4294                                         (errmsg("syntax error in history file: %s", fline),
4295                                          errhint("Expected a numeric timeline ID.")));
4296
4297                 if (result &&
4298                         tli <= (TimeLineID) linitial_int(result))
4299                         ereport(FATAL,
4300                                         (errmsg("invalid data in history file: %s", fline),
4301                                    errhint("Timeline IDs must be in increasing sequence.")));
4302
4303                 /* Build list with newest item first */
4304                 result = lcons_int((int) tli, result);
4305
4306                 /* we ignore the remainder of each line */
4307         }
4308
4309         FreeFile(fd);
4310
4311         if (result &&
4312                 targetTLI <= (TimeLineID) linitial_int(result))
4313                 ereport(FATAL,
4314                                 (errmsg("invalid data in history file \"%s\"", path),
4315                         errhint("Timeline IDs must be less than child timeline's ID.")));
4316
4317         result = lcons_int((int) targetTLI, result);
4318
4319         ereport(DEBUG3,
4320                         (errmsg_internal("history of timeline %u is %s",
4321                                                          targetTLI, nodeToString(result))));
4322
4323         return result;
4324 }
4325
4326 /*
4327  * Probe whether a timeline history file exists for the given timeline ID
4328  */
4329 static bool
4330 existsTimeLineHistory(TimeLineID probeTLI)
4331 {
4332         char            path[MAXPGPATH];
4333         char            histfname[MAXFNAMELEN];
4334         FILE       *fd;
4335
4336         /* Timeline 1 does not have a history file, so no need to check */
4337         if (probeTLI == 1)
4338                 return false;
4339
4340         if (InArchiveRecovery)
4341         {
4342                 TLHistoryFileName(histfname, probeTLI);
4343                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4344         }
4345         else
4346                 TLHistoryFilePath(path, probeTLI);
4347
4348         fd = AllocateFile(path, "r");
4349         if (fd != NULL)
4350         {
4351                 FreeFile(fd);
4352                 return true;
4353         }
4354         else
4355         {
4356                 if (errno != ENOENT)
4357                         ereport(FATAL,
4358                                         (errcode_for_file_access(),
4359                                          errmsg("could not open file \"%s\": %m", path)));
4360                 return false;
4361         }
4362 }
4363
4364 /*
4365  * Scan for new timelines that might have appeared in the archive since we
4366  * started recovery.
4367  *
4368  * If there are any, the function changes recovery target TLI to the latest
4369  * one and returns 'true'.
4370  */
4371 static bool
4372 rescanLatestTimeLine(void)
4373 {
4374         TimeLineID      newtarget;
4375
4376         newtarget = findNewestTimeLine(recoveryTargetTLI);
4377         if (newtarget != recoveryTargetTLI)
4378         {
4379                 /*
4380                  * Determine the list of expected TLIs for the new TLI
4381                  */
4382                 List       *newExpectedTLIs;
4383
4384                 newExpectedTLIs = readTimeLineHistory(newtarget);
4385
4386                 /*
4387                  * If the current timeline is not part of the history of the new
4388                  * timeline, we cannot proceed to it.
4389                  *
4390                  * XXX This isn't foolproof: The new timeline might have forked from
4391                  * the current one, but before the current recovery location. In that
4392                  * case we will still switch to the new timeline and proceed replaying
4393                  * from it even though the history doesn't match what we already
4394                  * replayed. That's not good. We will likely notice at the next online
4395                  * checkpoint, as the TLI won't match what we expected, but it's not
4396                  * guaranteed. The admin needs to make sure that doesn't happen.
4397                  */
4398                 if (!list_member_int(newExpectedTLIs,
4399                                                          (int) recoveryTargetTLI))
4400                         ereport(LOG,
4401                                         (errmsg("new timeline %u is not a child of database system timeline %u",
4402                                                         newtarget,
4403                                                         ThisTimeLineID)));
4404                 else
4405                 {
4406                         /* Switch target */
4407                         recoveryTargetTLI = newtarget;
4408                         list_free(expectedTLIs);
4409                         expectedTLIs = newExpectedTLIs;
4410
4411                         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
4412
4413                         ereport(LOG,
4414                                         (errmsg("new target timeline is %u",
4415                                                         recoveryTargetTLI)));
4416                         return true;
4417                 }
4418         }
4419         return false;
4420 }
4421
4422 /*
4423  * Find the newest existing timeline, assuming that startTLI exists.
4424  *
4425  * Note: while this is somewhat heuristic, it does positively guarantee
4426  * that (result + 1) is not a known timeline, and therefore it should
4427  * be safe to assign that ID to a new timeline.
4428  */
4429 static TimeLineID
4430 findNewestTimeLine(TimeLineID startTLI)
4431 {
4432         TimeLineID      newestTLI;
4433         TimeLineID      probeTLI;
4434
4435         /*
4436          * The algorithm is just to probe for the existence of timeline history
4437          * files.  XXX is it useful to allow gaps in the sequence?
4438          */
4439         newestTLI = startTLI;
4440
4441         for (probeTLI = startTLI + 1;; probeTLI++)
4442         {
4443                 if (existsTimeLineHistory(probeTLI))
4444                 {
4445                         newestTLI = probeTLI;           /* probeTLI exists */
4446                 }
4447                 else
4448                 {
4449                         /* doesn't exist, assume we're done */
4450                         break;
4451                 }
4452         }
4453
4454         return newestTLI;
4455 }
4456
4457 /*
4458  * Create a new timeline history file.
4459  *
4460  *      newTLI: ID of the new timeline
4461  *      parentTLI: ID of its immediate parent
4462  *      endTLI et al: ID of the last used WAL file, for annotation purposes
4463  *
4464  * Currently this is only used during recovery, and so there are no locking
4465  * considerations.      But we should be just as tense as XLogFileInit to avoid
4466  * emplacing a bogus file.
4467  */
4468 static void
4469 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
4470                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4471 {
4472         char            path[MAXPGPATH];
4473         char            tmppath[MAXPGPATH];
4474         char            histfname[MAXFNAMELEN];
4475         char            xlogfname[MAXFNAMELEN];
4476         char            buffer[BLCKSZ];
4477         int                     srcfd;
4478         int                     fd;
4479         int                     nbytes;
4480
4481         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4482
4483         /*
4484          * Write into a temp file name.
4485          */
4486         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4487
4488         unlink(tmppath);
4489
4490         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
4491         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
4492                                            S_IRUSR | S_IWUSR);
4493         if (fd < 0)
4494                 ereport(ERROR,
4495                                 (errcode_for_file_access(),
4496                                  errmsg("could not create file \"%s\": %m", tmppath)));
4497
4498         /*
4499          * If a history file exists for the parent, copy it verbatim
4500          */
4501         if (InArchiveRecovery)
4502         {
4503                 TLHistoryFileName(histfname, parentTLI);
4504                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4505         }
4506         else
4507                 TLHistoryFilePath(path, parentTLI);
4508
4509         srcfd = BasicOpenFile(path, O_RDONLY, 0);
4510         if (srcfd < 0)
4511         {
4512                 if (errno != ENOENT)
4513                         ereport(ERROR,
4514                                         (errcode_for_file_access(),
4515                                          errmsg("could not open file \"%s\": %m", path)));
4516                 /* Not there, so assume parent has no parents */
4517         }
4518         else
4519         {
4520                 for (;;)
4521                 {
4522                         errno = 0;
4523                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
4524                         if (nbytes < 0 || errno != 0)
4525                                 ereport(ERROR,
4526                                                 (errcode_for_file_access(),
4527                                                  errmsg("could not read file \"%s\": %m", path)));
4528                         if (nbytes == 0)
4529                                 break;
4530                         errno = 0;
4531                         if ((int) write(fd, buffer, nbytes) != nbytes)
4532                         {
4533                                 int                     save_errno = errno;
4534
4535                                 /*
4536                                  * If we fail to make the file, delete it to release disk
4537                                  * space
4538                                  */
4539                                 unlink(tmppath);
4540
4541                                 /*
4542                                  * if write didn't set errno, assume problem is no disk space
4543                                  */
4544                                 errno = save_errno ? save_errno : ENOSPC;
4545
4546                                 ereport(ERROR,
4547                                                 (errcode_for_file_access(),
4548                                          errmsg("could not write to file \"%s\": %m", tmppath)));
4549                         }
4550                 }
4551                 close(srcfd);
4552         }
4553
4554         /*
4555          * Append one line with the details of this timeline split.
4556          *
4557          * If we did have a parent file, insert an extra newline just in case the
4558          * parent file failed to end with one.
4559          */
4560         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
4561
4562         /*
4563          * Write comment to history file to explain why and where timeline
4564          * changed. Comment varies according to the recovery target used.
4565          */
4566         if (recoveryTarget == RECOVERY_TARGET_XID)
4567                 snprintf(buffer, sizeof(buffer),
4568                                  "%s%u\t%s\t%s transaction %u\n",
4569                                  (srcfd < 0) ? "" : "\n",
4570                                  parentTLI,
4571                                  xlogfname,
4572                                  recoveryStopAfter ? "after" : "before",
4573                                  recoveryStopXid);
4574         else if (recoveryTarget == RECOVERY_TARGET_TIME)
4575                 snprintf(buffer, sizeof(buffer),
4576                                  "%s%u\t%s\t%s %s\n",
4577                                  (srcfd < 0) ? "" : "\n",
4578                                  parentTLI,
4579                                  xlogfname,
4580                                  recoveryStopAfter ? "after" : "before",
4581                                  timestamptz_to_str(recoveryStopTime));
4582         else if (recoveryTarget == RECOVERY_TARGET_NAME)
4583                 snprintf(buffer, sizeof(buffer),
4584                                  "%s%u\t%s\tat restore point \"%s\"\n",
4585                                  (srcfd < 0) ? "" : "\n",
4586                                  parentTLI,
4587                                  xlogfname,
4588                                  recoveryStopName);
4589         else
4590                 snprintf(buffer, sizeof(buffer),
4591                                  "%s%u\t%s\tno recovery target specified\n",
4592                                  (srcfd < 0) ? "" : "\n",
4593                                  parentTLI,
4594                                  xlogfname);
4595
4596         nbytes = strlen(buffer);
4597         errno = 0;
4598         if ((int) write(fd, buffer, nbytes) != nbytes)
4599         {
4600                 int                     save_errno = errno;
4601
4602                 /*
4603                  * If we fail to make the file, delete it to release disk space
4604                  */
4605                 unlink(tmppath);
4606                 /* if write didn't set errno, assume problem is no disk space */
4607                 errno = save_errno ? save_errno : ENOSPC;
4608
4609                 ereport(ERROR,
4610                                 (errcode_for_file_access(),
4611                                  errmsg("could not write to file \"%s\": %m", tmppath)));
4612         }
4613
4614         if (pg_fsync(fd) != 0)
4615                 ereport(ERROR,
4616                                 (errcode_for_file_access(),
4617                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
4618
4619         if (close(fd))
4620                 ereport(ERROR,
4621                                 (errcode_for_file_access(),
4622                                  errmsg("could not close file \"%s\": %m", tmppath)));
4623
4624
4625         /*
4626          * Now move the completed history file into place with its final name.
4627          */
4628         TLHistoryFilePath(path, newTLI);
4629
4630         /*
4631          * Prefer link() to rename() here just to be really sure that we don't
4632          * overwrite an existing logfile.  However, there shouldn't be one, so
4633          * rename() is an acceptable substitute except for the truly paranoid.
4634          */
4635 #if HAVE_WORKING_LINK
4636         if (link(tmppath, path) < 0)
4637                 ereport(ERROR,
4638                                 (errcode_for_file_access(),
4639                                  errmsg("could not link file \"%s\" to \"%s\": %m",
4640                                                 tmppath, path)));
4641         unlink(tmppath);
4642 #else
4643         if (rename(tmppath, path) < 0)
4644                 ereport(ERROR,
4645                                 (errcode_for_file_access(),
4646                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4647                                                 tmppath, path)));
4648 #endif
4649
4650         /* The history file can be archived immediately. */
4651         TLHistoryFileName(histfname, newTLI);
4652         XLogArchiveNotify(histfname);
4653 }
4654
4655 /*
4656  * I/O routines for pg_control
4657  *
4658  * *ControlFile is a buffer in shared memory that holds an image of the
4659  * contents of pg_control.      WriteControlFile() initializes pg_control
4660  * given a preloaded buffer, ReadControlFile() loads the buffer from
4661  * the pg_control file (during postmaster or standalone-backend startup),
4662  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4663  *
4664  * For simplicity, WriteControlFile() initializes the fields of pg_control
4665  * that are related to checking backend/database compatibility, and
4666  * ReadControlFile() verifies they are correct.  We could split out the
4667  * I/O and compatibility-check functions, but there seems no need currently.
4668  */
4669 static void
4670 WriteControlFile(void)
4671 {
4672         int                     fd;
4673         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4674
4675         /*
4676          * Initialize version and compatibility-check fields
4677          */
4678         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4679         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4680
4681         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4682         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4683
4684         ControlFile->blcksz = BLCKSZ;
4685         ControlFile->relseg_size = RELSEG_SIZE;
4686         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4687         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4688
4689         ControlFile->nameDataLen = NAMEDATALEN;
4690         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4691
4692         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4693
4694 #ifdef HAVE_INT64_TIMESTAMP
4695         ControlFile->enableIntTimes = true;
4696 #else
4697         ControlFile->enableIntTimes = false;
4698 #endif
4699         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4700         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4701
4702         /* Contents are protected with a CRC */
4703         INIT_CRC32(ControlFile->crc);
4704         COMP_CRC32(ControlFile->crc,
4705                            (char *) ControlFile,
4706                            offsetof(ControlFileData, crc));
4707         FIN_CRC32(ControlFile->crc);
4708
4709         /*
4710          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4711          * excess over sizeof(ControlFileData).  This reduces the odds of
4712          * premature-EOF errors when reading pg_control.  We'll still fail when we
4713          * check the contents of the file, but hopefully with a more specific
4714          * error than "couldn't read pg_control".
4715          */
4716         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4717                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4718
4719         memset(buffer, 0, PG_CONTROL_SIZE);
4720         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4721
4722         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4723                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4724                                            S_IRUSR | S_IWUSR);
4725         if (fd < 0)
4726                 ereport(PANIC,
4727                                 (errcode_for_file_access(),
4728                                  errmsg("could not create control file \"%s\": %m",
4729                                                 XLOG_CONTROL_FILE)));
4730
4731         errno = 0;
4732         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4733         {
4734                 /* if write didn't set errno, assume problem is no disk space */
4735                 if (errno == 0)
4736                         errno = ENOSPC;
4737                 ereport(PANIC,
4738                                 (errcode_for_file_access(),
4739                                  errmsg("could not write to control file: %m")));
4740         }
4741
4742         if (pg_fsync(fd) != 0)
4743                 ereport(PANIC,
4744                                 (errcode_for_file_access(),
4745                                  errmsg("could not fsync control file: %m")));
4746
4747         if (close(fd))
4748                 ereport(PANIC,
4749                                 (errcode_for_file_access(),
4750                                  errmsg("could not close control file: %m")));
4751 }
4752
4753 static void
4754 ReadControlFile(void)
4755 {
4756         pg_crc32        crc;
4757         int                     fd;
4758
4759         /*
4760          * Read data...
4761          */
4762         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4763                                            O_RDWR | PG_BINARY,
4764                                            S_IRUSR | S_IWUSR);
4765         if (fd < 0)
4766                 ereport(PANIC,
4767                                 (errcode_for_file_access(),
4768                                  errmsg("could not open control file \"%s\": %m",
4769                                                 XLOG_CONTROL_FILE)));
4770
4771         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4772                 ereport(PANIC,
4773                                 (errcode_for_file_access(),
4774                                  errmsg("could not read from control file: %m")));
4775
4776         close(fd);
4777
4778         /*
4779          * Check for expected pg_control format version.  If this is wrong, the
4780          * CRC check will likely fail because we'll be checking the wrong number
4781          * of bytes.  Complaining about wrong version will probably be more
4782          * enlightening than complaining about wrong CRC.
4783          */
4784
4785         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4786                 ereport(FATAL,
4787                                 (errmsg("database files are incompatible with server"),
4788                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4789                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4790                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4791                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4792                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4793
4794         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4795                 ereport(FATAL,
4796                                 (errmsg("database files are incompatible with server"),
4797                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4798                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4799                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4800                                  errhint("It looks like you need to initdb.")));
4801
4802         /* Now check the CRC. */
4803         INIT_CRC32(crc);
4804         COMP_CRC32(crc,
4805                            (char *) ControlFile,
4806                            offsetof(ControlFileData, crc));
4807         FIN_CRC32(crc);
4808
4809         if (!EQ_CRC32(crc, ControlFile->crc))
4810                 ereport(FATAL,
4811                                 (errmsg("incorrect checksum in control file")));
4812
4813         /*
4814          * Do compatibility checking immediately.  If the database isn't
4815          * compatible with the backend executable, we want to abort before we can
4816          * possibly do any damage.
4817          */
4818         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4819                 ereport(FATAL,
4820                                 (errmsg("database files are incompatible with server"),
4821                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4822                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4823                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4824                                  errhint("It looks like you need to initdb.")));
4825         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4826                 ereport(FATAL,
4827                                 (errmsg("database files are incompatible with server"),
4828                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4829                                          " but the server was compiled with MAXALIGN %d.",
4830                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4831                                  errhint("It looks like you need to initdb.")));
4832         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4833                 ereport(FATAL,
4834                                 (errmsg("database files are incompatible with server"),
4835                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4836                                  errhint("It looks like you need to initdb.")));
4837         if (ControlFile->blcksz != BLCKSZ)
4838                 ereport(FATAL,
4839                                 (errmsg("database files are incompatible with server"),
4840                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4841                                            " but the server was compiled with BLCKSZ %d.",
4842                                            ControlFile->blcksz, BLCKSZ),
4843                                  errhint("It looks like you need to recompile or initdb.")));
4844         if (ControlFile->relseg_size != RELSEG_SIZE)
4845                 ereport(FATAL,
4846                                 (errmsg("database files are incompatible with server"),
4847                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4848                                   " but the server was compiled with RELSEG_SIZE %d.",
4849                                   ControlFile->relseg_size, RELSEG_SIZE),
4850                                  errhint("It looks like you need to recompile or initdb.")));
4851         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4852                 ereport(FATAL,
4853                                 (errmsg("database files are incompatible with server"),
4854                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4855                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4856                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4857                                  errhint("It looks like you need to recompile or initdb.")));
4858         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4859                 ereport(FATAL,
4860                                 (errmsg("database files are incompatible with server"),
4861                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4862                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4863                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4864                                  errhint("It looks like you need to recompile or initdb.")));
4865         if (ControlFile->nameDataLen != NAMEDATALEN)
4866                 ereport(FATAL,
4867                                 (errmsg("database files are incompatible with server"),
4868                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4869                                   " but the server was compiled with NAMEDATALEN %d.",
4870                                   ControlFile->nameDataLen, NAMEDATALEN),
4871                                  errhint("It looks like you need to recompile or initdb.")));
4872         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4873                 ereport(FATAL,
4874                                 (errmsg("database files are incompatible with server"),
4875                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4876                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4877                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4878                                  errhint("It looks like you need to recompile or initdb.")));
4879         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4880                 ereport(FATAL,
4881                                 (errmsg("database files are incompatible with server"),
4882                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4883                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4884                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4885                                  errhint("It looks like you need to recompile or initdb.")));
4886
4887 #ifdef HAVE_INT64_TIMESTAMP
4888         if (ControlFile->enableIntTimes != true)
4889                 ereport(FATAL,
4890                                 (errmsg("database files are incompatible with server"),
4891                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4892                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4893                                  errhint("It looks like you need to recompile or initdb.")));
4894 #else
4895         if (ControlFile->enableIntTimes != false)
4896                 ereport(FATAL,
4897                                 (errmsg("database files are incompatible with server"),
4898                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4899                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4900                                  errhint("It looks like you need to recompile or initdb.")));
4901 #endif
4902
4903 #ifdef USE_FLOAT4_BYVAL
4904         if (ControlFile->float4ByVal != true)
4905                 ereport(FATAL,
4906                                 (errmsg("database files are incompatible with server"),
4907                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4908                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4909                                  errhint("It looks like you need to recompile or initdb.")));
4910 #else
4911         if (ControlFile->float4ByVal != false)
4912                 ereport(FATAL,
4913                                 (errmsg("database files are incompatible with server"),
4914                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4915                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4916                                  errhint("It looks like you need to recompile or initdb.")));
4917 #endif
4918
4919 #ifdef USE_FLOAT8_BYVAL
4920         if (ControlFile->float8ByVal != true)
4921                 ereport(FATAL,
4922                                 (errmsg("database files are incompatible with server"),
4923                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4924                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4925                                  errhint("It looks like you need to recompile or initdb.")));
4926 #else
4927         if (ControlFile->float8ByVal != false)
4928                 ereport(FATAL,
4929                                 (errmsg("database files are incompatible with server"),
4930                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4931                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4932                                  errhint("It looks like you need to recompile or initdb.")));
4933 #endif
4934 }
4935
4936 void
4937 UpdateControlFile(void)
4938 {
4939         int                     fd;
4940
4941         INIT_CRC32(ControlFile->crc);
4942         COMP_CRC32(ControlFile->crc,
4943                            (char *) ControlFile,
4944                            offsetof(ControlFileData, crc));
4945         FIN_CRC32(ControlFile->crc);
4946
4947         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4948                                            O_RDWR | PG_BINARY,
4949                                            S_IRUSR | S_IWUSR);
4950         if (fd < 0)
4951                 ereport(PANIC,
4952                                 (errcode_for_file_access(),
4953                                  errmsg("could not open control file \"%s\": %m",
4954                                                 XLOG_CONTROL_FILE)));
4955
4956         errno = 0;
4957         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4958         {
4959                 /* if write didn't set errno, assume problem is no disk space */
4960                 if (errno == 0)
4961                         errno = ENOSPC;
4962                 ereport(PANIC,
4963                                 (errcode_for_file_access(),
4964                                  errmsg("could not write to control file: %m")));
4965         }
4966
4967         if (pg_fsync(fd) != 0)
4968                 ereport(PANIC,
4969                                 (errcode_for_file_access(),
4970                                  errmsg("could not fsync control file: %m")));
4971
4972         if (close(fd))
4973                 ereport(PANIC,
4974                                 (errcode_for_file_access(),
4975                                  errmsg("could not close control file: %m")));
4976 }
4977
4978 /*
4979  * Returns the unique system identifier from control file.
4980  */
4981 uint64
4982 GetSystemIdentifier(void)
4983 {
4984         Assert(ControlFile != NULL);
4985         return ControlFile->system_identifier;
4986 }
4987
4988 /*
4989  * Auto-tune the number of XLOG buffers.
4990  *
4991  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4992  * a maximum of one XLOG segment (there is little reason to think that more
4993  * is helpful, at least so long as we force an fsync when switching log files)
4994  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4995  * 9.1, when auto-tuning was added).
4996  *
4997  * This should not be called until NBuffers has received its final value.
4998  */
4999 static int
5000 XLOGChooseNumBuffers(void)
5001 {
5002         int                     xbuffers;
5003
5004         xbuffers = NBuffers / 32;
5005         if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
5006                 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
5007         if (xbuffers < 8)
5008                 xbuffers = 8;
5009         return xbuffers;
5010 }
5011
5012 /*
5013  * GUC check_hook for wal_buffers
5014  */
5015 bool
5016 check_wal_buffers(int *newval, void **extra, GucSource source)
5017 {
5018         /*
5019          * -1 indicates a request for auto-tune.
5020          */
5021         if (*newval == -1)
5022         {
5023                 /*
5024                  * If we haven't yet changed the boot_val default of -1, just let it
5025                  * be.  We'll fix it when XLOGShmemSize is called.
5026                  */
5027                 if (XLOGbuffers == -1)
5028                         return true;
5029
5030                 /* Otherwise, substitute the auto-tune value */
5031                 *newval = XLOGChooseNumBuffers();
5032         }
5033
5034         /*
5035          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
5036          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5037          * the case, we just silently treat such values as a request for the
5038          * minimum.  (We could throw an error instead, but that doesn't seem very
5039          * helpful.)
5040          */
5041         if (*newval < 4)
5042                 *newval = 4;
5043
5044         return true;
5045 }
5046
5047 /*
5048  * Initialization of shared memory for XLOG
5049  */
5050 Size
5051 XLOGShmemSize(void)
5052 {
5053         Size            size;
5054
5055         /*
5056          * If the value of wal_buffers is -1, use the preferred auto-tune value.
5057          * This isn't an amazingly clean place to do this, but we must wait till
5058          * NBuffers has received its final value, and must do it before using the
5059          * value of XLOGbuffers to do anything important.
5060          */
5061         if (XLOGbuffers == -1)
5062         {
5063                 char            buf[32];
5064
5065                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5066                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5067         }
5068         Assert(XLOGbuffers > 0);
5069
5070         /* XLogCtl */
5071         size = sizeof(XLogCtlData);
5072         /* xlblocks array */
5073         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5074         /* extra alignment padding for XLOG I/O buffers */
5075         size = add_size(size, ALIGNOF_XLOG_BUFFER);
5076         /* and the buffers themselves */
5077         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5078
5079         /*
5080          * Note: we don't count ControlFileData, it comes out of the "slop factor"
5081          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5082          * routine again below to compute the actual allocation size.
5083          */
5084
5085         return size;
5086 }
5087
5088 void
5089 XLOGShmemInit(void)
5090 {
5091         bool            foundCFile,
5092                                 foundXLog;
5093         char       *allocptr;
5094
5095         ControlFile = (ControlFileData *)
5096                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5097         XLogCtl = (XLogCtlData *)
5098                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5099
5100         if (foundCFile || foundXLog)
5101         {
5102                 /* both should be present or neither */
5103                 Assert(foundCFile && foundXLog);
5104                 return;
5105         }
5106
5107         memset(XLogCtl, 0, sizeof(XLogCtlData));
5108
5109         /*
5110          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5111          * multiple of the alignment for same, so no extra alignment padding is
5112          * needed here.
5113          */
5114         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5115         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5116         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5117         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5118
5119         /*
5120          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
5121          */
5122         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
5123         XLogCtl->pages = allocptr;
5124         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5125
5126         /*
5127          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5128          * in additional info.)
5129          */
5130         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5131         XLogCtl->SharedRecoveryInProgress = true;
5132         XLogCtl->SharedHotStandbyActive = false;
5133         XLogCtl->WalWriterSleeping = false;
5134         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
5135         SpinLockInit(&XLogCtl->info_lck);
5136         InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5137
5138         /*
5139          * If we are not in bootstrap mode, pg_control should already exist. Read
5140          * and validate it immediately (see comments in ReadControlFile() for the
5141          * reasons why).
5142          */
5143         if (!IsBootstrapProcessingMode())
5144                 ReadControlFile();
5145 }
5146
5147 /*
5148  * This func must be called ONCE on system install.  It creates pg_control
5149  * and the initial XLOG segment.
5150  */
5151 void
5152 BootStrapXLOG(void)
5153 {
5154         CheckPoint      checkPoint;
5155         char       *buffer;
5156         XLogPageHeader page;
5157         XLogLongPageHeader longpage;
5158         XLogRecord *record;
5159         bool            use_existent;
5160         uint64          sysidentifier;
5161         struct timeval tv;
5162         pg_crc32        crc;
5163
5164         /*
5165          * Select a hopefully-unique system identifier code for this installation.
5166          * We use the result of gettimeofday(), including the fractional seconds
5167          * field, as being about as unique as we can easily get.  (Think not to
5168          * use random(), since it hasn't been seeded and there's no portable way
5169          * to seed it other than the system clock value...)  The upper half of the
5170          * uint64 value is just the tv_sec part, while the lower half is the XOR
5171          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
5172          * unnecessarily if "uint64" is really only 32 bits wide.  A person
5173          * knowing this encoding can determine the initialization time of the
5174          * installation, which could perhaps be useful sometimes.
5175          */
5176         gettimeofday(&tv, NULL);
5177         sysidentifier = ((uint64) tv.tv_sec) << 32;
5178         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
5179
5180         /* First timeline ID is always 1 */
5181         ThisTimeLineID = 1;
5182
5183         /* page buffer must be aligned suitably for O_DIRECT */
5184         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
5185         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
5186         memset(page, 0, XLOG_BLCKSZ);
5187
5188         /*
5189          * Set up information for the initial checkpoint record
5190          *
5191          * The initial checkpoint record is written to the beginning of the WAL
5192          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5193          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5194          */
5195         checkPoint.redo.xlogid = 0;
5196         checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
5197         checkPoint.ThisTimeLineID = ThisTimeLineID;
5198         checkPoint.fullPageWrites = fullPageWrites;
5199         checkPoint.nextXidEpoch = 0;
5200         checkPoint.nextXid = FirstNormalTransactionId;
5201         checkPoint.nextOid = FirstBootstrapObjectId;
5202         checkPoint.nextMulti = FirstMultiXactId;
5203         checkPoint.nextMultiOffset = 0;
5204         checkPoint.oldestXid = FirstNormalTransactionId;
5205         checkPoint.oldestXidDB = TemplateDbOid;
5206         checkPoint.time = (pg_time_t) time(NULL);
5207         checkPoint.oldestActiveXid = InvalidTransactionId;
5208
5209         ShmemVariableCache->nextXid = checkPoint.nextXid;
5210         ShmemVariableCache->nextOid = checkPoint.nextOid;
5211         ShmemVariableCache->oidCount = 0;
5212         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5213         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5214
5215         /* Set up the XLOG page header */
5216         page->xlp_magic = XLOG_PAGE_MAGIC;
5217         page->xlp_info = XLP_LONG_HEADER;
5218         page->xlp_tli = ThisTimeLineID;
5219         page->xlp_pageaddr.xlogid = 0;
5220         page->xlp_pageaddr.xrecoff = XLogSegSize;
5221         longpage = (XLogLongPageHeader) page;
5222         longpage->xlp_sysid = sysidentifier;
5223         longpage->xlp_seg_size = XLogSegSize;
5224         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5225
5226         /* Insert the initial checkpoint record */
5227         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5228         record->xl_prev.xlogid = 0;
5229         record->xl_prev.xrecoff = 0;
5230         record->xl_xid = InvalidTransactionId;
5231         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5232         record->xl_len = sizeof(checkPoint);
5233         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5234         record->xl_rmid = RM_XLOG_ID;
5235         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5236
5237         INIT_CRC32(crc);
5238         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
5239         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
5240                            SizeOfXLogRecord - sizeof(pg_crc32));
5241         FIN_CRC32(crc);
5242         record->xl_crc = crc;
5243
5244         /* Create first XLOG segment file */
5245         use_existent = false;
5246         openLogFile = XLogFileInit(0, 1, &use_existent, false);
5247
5248         /* Write the first page with the initial record */
5249         errno = 0;
5250         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5251         {
5252                 /* if write didn't set errno, assume problem is no disk space */
5253                 if (errno == 0)
5254                         errno = ENOSPC;
5255                 ereport(PANIC,
5256                                 (errcode_for_file_access(),
5257                           errmsg("could not write bootstrap transaction log file: %m")));
5258         }
5259
5260         if (pg_fsync(openLogFile) != 0)
5261                 ereport(PANIC,
5262                                 (errcode_for_file_access(),
5263                           errmsg("could not fsync bootstrap transaction log file: %m")));
5264
5265         if (close(openLogFile))
5266                 ereport(PANIC,
5267                                 (errcode_for_file_access(),
5268                           errmsg("could not close bootstrap transaction log file: %m")));
5269
5270         openLogFile = -1;
5271
5272         /* Now create pg_control */
5273
5274         memset(ControlFile, 0, sizeof(ControlFileData));
5275         /* Initialize pg_control status fields */
5276         ControlFile->system_identifier = sysidentifier;
5277         ControlFile->state = DB_SHUTDOWNED;
5278         ControlFile->time = checkPoint.time;
5279         ControlFile->checkPoint = checkPoint.redo;
5280         ControlFile->checkPointCopy = checkPoint;
5281
5282         /* Set important parameter values for use when replaying WAL */
5283         ControlFile->MaxConnections = MaxConnections;
5284         ControlFile->max_prepared_xacts = max_prepared_xacts;
5285         ControlFile->max_locks_per_xact = max_locks_per_xact;
5286         ControlFile->wal_level = wal_level;
5287
5288         /* some additional ControlFile fields are set in WriteControlFile() */
5289
5290         WriteControlFile();
5291
5292         /* Bootstrap the commit log, too */
5293         BootStrapCLOG();
5294         BootStrapSUBTRANS();
5295         BootStrapMultiXact();
5296
5297         pfree(buffer);
5298 }
5299
5300 static char *
5301 str_time(pg_time_t tnow)
5302 {
5303         static char buf[128];
5304
5305         pg_strftime(buf, sizeof(buf),
5306                                 "%Y-%m-%d %H:%M:%S %Z",
5307                                 pg_localtime(&tnow, log_timezone));
5308
5309         return buf;
5310 }
5311
5312 /*
5313  * See if there is a recovery command file (recovery.conf), and if so
5314  * read in parameters for archive recovery and XLOG streaming.
5315  *
5316  * The file is parsed using the main configuration parser.
5317  */
5318 static void
5319 readRecoveryCommandFile(void)
5320 {
5321         FILE       *fd;
5322         TimeLineID      rtli = 0;
5323         bool            rtliGiven = false;
5324         ConfigVariable *item,
5325                            *head = NULL,
5326                            *tail = NULL;
5327
5328         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5329         if (fd == NULL)
5330         {
5331                 if (errno == ENOENT)
5332                         return;                         /* not there, so no archive recovery */
5333                 ereport(FATAL,
5334                                 (errcode_for_file_access(),
5335                                  errmsg("could not open recovery command file \"%s\": %m",
5336                                                 RECOVERY_COMMAND_FILE)));
5337         }
5338
5339         /*
5340          * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5341          * no need to check the return value.
5342          */
5343         (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5344
5345         FreeFile(fd);
5346
5347         for (item = head; item; item = item->next)
5348         {
5349                 if (strcmp(item->name, "restore_command") == 0)
5350                 {
5351                         recoveryRestoreCommand = pstrdup(item->value);
5352                         ereport(DEBUG2,
5353                                         (errmsg_internal("restore_command = '%s'",
5354                                                                          recoveryRestoreCommand)));
5355                 }
5356                 else if (strcmp(item->name, "recovery_end_command") == 0)
5357                 {
5358                         recoveryEndCommand = pstrdup(item->value);
5359                         ereport(DEBUG2,
5360                                         (errmsg_internal("recovery_end_command = '%s'",
5361                                                                          recoveryEndCommand)));
5362                 }
5363                 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5364                 {
5365                         archiveCleanupCommand = pstrdup(item->value);
5366                         ereport(DEBUG2,
5367                                         (errmsg_internal("archive_cleanup_command = '%s'",
5368                                                                          archiveCleanupCommand)));
5369                 }
5370                 else if (strcmp(item->name, "pause_at_recovery_target") == 0)
5371                 {
5372                         if (!parse_bool(item->value, &recoveryPauseAtTarget))
5373                                 ereport(ERROR,
5374                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5375                                                  errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
5376                         ereport(DEBUG2,
5377                                         (errmsg_internal("pause_at_recovery_target = '%s'",
5378                                                                          item->value)));
5379                 }
5380                 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5381                 {
5382                         rtliGiven = true;
5383                         if (strcmp(item->value, "latest") == 0)
5384                                 rtli = 0;
5385                         else
5386                         {
5387                                 errno = 0;
5388                                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5389                                 if (errno == EINVAL || errno == ERANGE)
5390                                         ereport(FATAL,
5391                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5392                                                                         item->value)));
5393                         }
5394                         if (rtli)
5395                                 ereport(DEBUG2,
5396                                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
5397                         else
5398                                 ereport(DEBUG2,
5399                                          (errmsg_internal("recovery_target_timeline = latest")));
5400                 }
5401                 else if (strcmp(item->name, "recovery_target_xid") == 0)
5402                 {
5403                         errno = 0;
5404                         recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5405                         if (errno == EINVAL || errno == ERANGE)
5406                                 ereport(FATAL,
5407                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5408                                                  item->value)));
5409                         ereport(DEBUG2,
5410                                         (errmsg_internal("recovery_target_xid = %u",
5411                                                                          recoveryTargetXid)));
5412                         recoveryTarget = RECOVERY_TARGET_XID;
5413                 }
5414                 else if (strcmp(item->name, "recovery_target_time") == 0)
5415                 {
5416                         /*
5417                          * if recovery_target_xid or recovery_target_name specified, then
5418                          * this overrides recovery_target_time
5419                          */
5420                         if (recoveryTarget == RECOVERY_TARGET_XID ||
5421                                 recoveryTarget == RECOVERY_TARGET_NAME)
5422                                 continue;
5423                         recoveryTarget = RECOVERY_TARGET_TIME;
5424
5425                         /*
5426                          * Convert the time string given by the user to TimestampTz form.
5427                          */
5428                         recoveryTargetTime =
5429                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5430                                                                                                 CStringGetDatum(item->value),
5431                                                                                                 ObjectIdGetDatum(InvalidOid),
5432                                                                                                                 Int32GetDatum(-1)));
5433                         ereport(DEBUG2,
5434                                         (errmsg_internal("recovery_target_time = '%s'",
5435                                                                    timestamptz_to_str(recoveryTargetTime))));
5436                 }
5437                 else if (strcmp(item->name, "recovery_target_name") == 0)
5438                 {
5439                         /*
5440                          * if recovery_target_xid specified, then this overrides
5441                          * recovery_target_name
5442                          */
5443                         if (recoveryTarget == RECOVERY_TARGET_XID)
5444                                 continue;
5445                         recoveryTarget = RECOVERY_TARGET_NAME;
5446
5447                         recoveryTargetName = pstrdup(item->value);
5448                         if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5449                                 ereport(FATAL,
5450                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5451                                                  errmsg("recovery_target_name is too long (maximum %d characters)",
5452                                                                 MAXFNAMELEN - 1)));
5453
5454                         ereport(DEBUG2,
5455                                         (errmsg_internal("recovery_target_name = '%s'",
5456                                                                          recoveryTargetName)));
5457                 }
5458                 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5459                 {
5460                         /*
5461                          * does nothing if a recovery_target is not also set
5462                          */
5463                         if (!parse_bool(item->value, &recoveryTargetInclusive))
5464                                 ereport(ERROR,
5465                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5466                                                  errmsg("parameter \"%s\" requires a Boolean value",
5467                                                                 "recovery_target_inclusive")));
5468                         ereport(DEBUG2,
5469                                         (errmsg_internal("recovery_target_inclusive = %s",
5470                                                                          item->value)));
5471                 }
5472                 else if (strcmp(item->name, "standby_mode") == 0)
5473                 {
5474                         if (!parse_bool(item->value, &StandbyMode))
5475                                 ereport(ERROR,
5476                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5477                                                  errmsg("parameter \"%s\" requires a Boolean value",
5478                                                                 "standby_mode")));
5479                         ereport(DEBUG2,
5480                                         (errmsg_internal("standby_mode = '%s'", item->value)));
5481                 }
5482                 else if (strcmp(item->name, "primary_conninfo") == 0)
5483                 {
5484                         PrimaryConnInfo = pstrdup(item->value);
5485                         ereport(DEBUG2,
5486                                         (errmsg_internal("primary_conninfo = '%s'",
5487                                                                          PrimaryConnInfo)));
5488                 }
5489                 else if (strcmp(item->name, "trigger_file") == 0)
5490                 {
5491                         TriggerFile = pstrdup(item->value);
5492                         ereport(DEBUG2,
5493                                         (errmsg_internal("trigger_file = '%s'",
5494                                                                          TriggerFile)));
5495                 }
5496                 else
5497                         ereport(FATAL,
5498                                         (errmsg("unrecognized recovery parameter \"%s\"",
5499                                                         item->name)));
5500         }
5501
5502         /*
5503          * Check for compulsory parameters
5504          */
5505         if (StandbyMode)
5506         {
5507                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5508                         ereport(WARNING,
5509                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5510                                                         RECOVERY_COMMAND_FILE),
5511                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5512         }
5513         else
5514         {
5515                 if (recoveryRestoreCommand == NULL)
5516                         ereport(FATAL,
5517                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5518                                                         RECOVERY_COMMAND_FILE)));
5519         }
5520
5521         /* Enable fetching from archive recovery area */
5522         InArchiveRecovery = true;
5523
5524         /*
5525          * If user specified recovery_target_timeline, validate it or compute the
5526          * "latest" value.      We can't do this until after we've gotten the restore
5527          * command and set InArchiveRecovery, because we need to fetch timeline
5528          * history files from the archive.
5529          */
5530         if (rtliGiven)
5531         {
5532                 if (rtli)
5533                 {
5534                         /* Timeline 1 does not have a history file, all else should */
5535                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5536                                 ereport(FATAL,
5537                                                 (errmsg("recovery target timeline %u does not exist",
5538                                                                 rtli)));
5539                         recoveryTargetTLI = rtli;
5540                         recoveryTargetIsLatest = false;
5541                 }
5542                 else
5543                 {
5544                         /* We start the "latest" search from pg_control's timeline */
5545                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5546                         recoveryTargetIsLatest = true;
5547                 }
5548         }
5549
5550         FreeConfigVariables(head);
5551 }
5552
5553 /*
5554  * Exit archive-recovery state
5555  */
5556 static void
5557 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5558 {
5559         char            recoveryPath[MAXPGPATH];
5560         char            xlogpath[MAXPGPATH];
5561
5562         /*
5563          * We are no longer in archive recovery state.
5564          */
5565         InArchiveRecovery = false;
5566
5567         /*
5568          * Update min recovery point one last time.
5569          */
5570         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5571
5572         /*
5573          * If the ending log segment is still open, close it (to avoid problems on
5574          * Windows with trying to rename or delete an open file).
5575          */
5576         if (readFile >= 0)
5577         {
5578                 close(readFile);
5579                 readFile = -1;
5580         }
5581
5582         /*
5583          * If we are establishing a new timeline, we have to copy data from the
5584          * last WAL segment of the old timeline to create a starting WAL segment
5585          * for the new timeline.
5586          *
5587          * Notify the archiver that the last WAL segment of the old timeline is
5588          * ready to copy to archival storage. Otherwise, it is not archived for a
5589          * while.
5590          */
5591         if (endTLI != ThisTimeLineID)
5592         {
5593                 XLogFileCopy(endLogId, endLogSeg,
5594                                          endTLI, endLogId, endLogSeg);
5595
5596                 if (XLogArchivingActive())
5597                 {
5598                         XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
5599                         XLogArchiveNotify(xlogpath);
5600                 }
5601         }
5602
5603         /*
5604          * Let's just make real sure there are not .ready or .done flags posted
5605          * for the new segment.
5606          */
5607         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5608         XLogArchiveCleanup(xlogpath);
5609
5610         /*
5611          * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
5612          * of it.
5613          */
5614         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5615         unlink(recoveryPath);           /* ignore any error */
5616
5617         /* Get rid of any remaining recovered timeline-history file, too */
5618         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5619         unlink(recoveryPath);           /* ignore any error */
5620
5621         /*
5622          * Rename the config file out of the way, so that we don't accidentally
5623          * re-enter archive recovery mode in a subsequent crash.
5624          */
5625         unlink(RECOVERY_COMMAND_DONE);
5626         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5627                 ereport(FATAL,
5628                                 (errcode_for_file_access(),
5629                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5630                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5631
5632         ereport(LOG,
5633                         (errmsg("archive recovery complete")));
5634 }
5635
5636 /*
5637  * For point-in-time recovery, this function decides whether we want to
5638  * stop applying the XLOG at or after the current record.
5639  *
5640  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5641  * *includeThis is set TRUE if we should apply this record before stopping.
5642  *
5643  * We also track the timestamp of the latest applied COMMIT/ABORT
5644  * record in XLogCtl->recoveryLastXTime, for logging purposes.
5645  * Also, some information is saved in recoveryStopXid et al for use in
5646  * annotating the new timeline's history file.
5647  */
5648 static bool
5649 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5650 {
5651         bool            stopsHere;
5652         uint8           record_info;
5653         TimestampTz recordXtime;
5654         char            recordRPName[MAXFNAMELEN];
5655
5656         /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
5657         if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
5658                 return false;
5659         record_info = record->xl_info & ~XLR_INFO_MASK;
5660         if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
5661         {
5662                 xl_xact_commit_compact *recordXactCommitData;
5663
5664                 recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
5665                 recordXtime = recordXactCommitData->xact_time;
5666         }
5667         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
5668         {
5669                 xl_xact_commit *recordXactCommitData;
5670
5671                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5672                 recordXtime = recordXactCommitData->xact_time;
5673         }
5674         else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
5675         {
5676                 xl_xact_abort *recordXactAbortData;
5677
5678                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5679                 recordXtime = recordXactAbortData->xact_time;
5680         }
5681         else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
5682         {
5683                 xl_restore_point *recordRestorePointData;
5684
5685                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5686                 recordXtime = recordRestorePointData->rp_time;
5687                 strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
5688         }
5689         else
5690                 return false;
5691
5692         /* Do we have a PITR target at all? */
5693         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5694         {
5695                 /*
5696                  * Save timestamp of latest transaction commit/abort if this is a
5697                  * transaction record
5698                  */
5699                 if (record->xl_rmid == RM_XACT_ID)
5700                         SetLatestXTime(recordXtime);
5701                 return false;
5702         }
5703
5704         if (recoveryTarget == RECOVERY_TARGET_XID)
5705         {
5706                 /*
5707                  * There can be only one transaction end record with this exact
5708                  * transactionid
5709                  *
5710                  * when testing for an xid, we MUST test for equality only, since
5711                  * transactions are numbered in the order they start, not the order
5712                  * they complete. A higher numbered xid will complete before you about
5713                  * 50% of the time...
5714                  */
5715                 stopsHere = (record->xl_xid == recoveryTargetXid);
5716                 if (stopsHere)
5717                         *includeThis = recoveryTargetInclusive;
5718         }
5719         else if (recoveryTarget == RECOVERY_TARGET_NAME)
5720         {
5721                 /*
5722                  * There can be many restore points that share the same name, so we
5723                  * stop at the first one
5724                  */
5725                 stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
5726
5727                 /*
5728                  * Ignore recoveryTargetInclusive because this is not a transaction
5729                  * record
5730                  */
5731                 *includeThis = false;
5732         }
5733         else
5734         {
5735                 /*
5736                  * There can be many transactions that share the same commit time, so
5737                  * we stop after the last one, if we are inclusive, or stop at the
5738                  * first one if we are exclusive
5739                  */
5740                 if (recoveryTargetInclusive)
5741                         stopsHere = (recordXtime > recoveryTargetTime);
5742                 else
5743                         stopsHere = (recordXtime >= recoveryTargetTime);
5744                 if (stopsHere)
5745                         *includeThis = false;
5746         }
5747
5748         if (stopsHere)
5749         {
5750                 recoveryStopXid = record->xl_xid;
5751                 recoveryStopTime = recordXtime;
5752                 recoveryStopAfter = *includeThis;
5753
5754                 if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
5755                 {
5756                         if (recoveryStopAfter)
5757                                 ereport(LOG,
5758                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5759                                                                 recoveryStopXid,
5760                                                                 timestamptz_to_str(recoveryStopTime))));
5761                         else
5762                                 ereport(LOG,
5763                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5764                                                                 recoveryStopXid,
5765                                                                 timestamptz_to_str(recoveryStopTime))));
5766                 }
5767                 else if (record_info == XLOG_XACT_ABORT)
5768                 {
5769                         if (recoveryStopAfter)
5770                                 ereport(LOG,
5771                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5772                                                                 recoveryStopXid,
5773                                                                 timestamptz_to_str(recoveryStopTime))));
5774                         else
5775                                 ereport(LOG,
5776                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5777                                                                 recoveryStopXid,
5778                                                                 timestamptz_to_str(recoveryStopTime))));
5779                 }
5780                 else
5781                 {
5782                         strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
5783
5784                         ereport(LOG,
5785                                 (errmsg("recovery stopping at restore point \"%s\", time %s",
5786                                                 recoveryStopName,
5787                                                 timestamptz_to_str(recoveryStopTime))));
5788                 }
5789
5790                 /*
5791                  * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
5792                  * restore point since they are timestamped, though the latest
5793                  * transaction time is not updated.
5794                  */
5795                 if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
5796                         SetLatestXTime(recordXtime);
5797         }
5798         else if (record->xl_rmid == RM_XACT_ID)
5799                 SetLatestXTime(recordXtime);
5800
5801         return stopsHere;
5802 }
5803
5804 /*
5805  * Recheck shared recoveryPause by polling.
5806  *
5807  * XXX Can also be done with shared latch.
5808  */
5809 static void
5810 recoveryPausesHere(void)
5811 {
5812         ereport(LOG,
5813                         (errmsg("recovery has paused"),
5814                          errhint("Execute pg_xlog_replay_resume() to continue.")));
5815
5816         while (RecoveryIsPaused())
5817         {
5818                 pg_usleep(1000000L);    /* 1000 ms */
5819                 HandleStartupProcInterrupts();
5820         }
5821 }
5822
5823 bool
5824 RecoveryIsPaused(void)
5825 {
5826         /* use volatile pointer to prevent code rearrangement */
5827         volatile XLogCtlData *xlogctl = XLogCtl;
5828         bool            recoveryPause;
5829
5830         SpinLockAcquire(&xlogctl->info_lck);
5831         recoveryPause = xlogctl->recoveryPause;
5832         SpinLockRelease(&xlogctl->info_lck);
5833
5834         return recoveryPause;
5835 }
5836
5837 void
5838 SetRecoveryPause(bool recoveryPause)
5839 {
5840         /* use volatile pointer to prevent code rearrangement */
5841         volatile XLogCtlData *xlogctl = XLogCtl;
5842
5843         SpinLockAcquire(&xlogctl->info_lck);
5844         xlogctl->recoveryPause = recoveryPause;
5845         SpinLockRelease(&xlogctl->info_lck);
5846 }
5847
5848 /*
5849  * Save timestamp of latest processed commit/abort record.
5850  *
5851  * We keep this in XLogCtl, not a simple static variable, so that it can be
5852  * seen by processes other than the startup process.  Note in particular
5853  * that CreateRestartPoint is executed in the checkpointer.
5854  */
5855 static void
5856 SetLatestXTime(TimestampTz xtime)
5857 {
5858         /* use volatile pointer to prevent code rearrangement */
5859         volatile XLogCtlData *xlogctl = XLogCtl;
5860
5861         SpinLockAcquire(&xlogctl->info_lck);
5862         xlogctl->recoveryLastXTime = xtime;
5863         SpinLockRelease(&xlogctl->info_lck);
5864 }
5865
5866 /*
5867  * Fetch timestamp of latest processed commit/abort record.
5868  */
5869 TimestampTz
5870 GetLatestXTime(void)
5871 {
5872         /* use volatile pointer to prevent code rearrangement */
5873         volatile XLogCtlData *xlogctl = XLogCtl;
5874         TimestampTz xtime;
5875
5876         SpinLockAcquire(&xlogctl->info_lck);
5877         xtime = xlogctl->recoveryLastXTime;
5878         SpinLockRelease(&xlogctl->info_lck);
5879
5880         return xtime;
5881 }
5882
5883 /*
5884  * Save timestamp of the next chunk of WAL records to apply.
5885  *
5886  * We keep this in XLogCtl, not a simple static variable, so that it can be
5887  * seen by all backends.
5888  */
5889 static void
5890 SetCurrentChunkStartTime(TimestampTz xtime)
5891 {
5892         /* use volatile pointer to prevent code rearrangement */
5893         volatile XLogCtlData *xlogctl = XLogCtl;
5894
5895         SpinLockAcquire(&xlogctl->info_lck);
5896         xlogctl->currentChunkStartTime = xtime;
5897         SpinLockRelease(&xlogctl->info_lck);
5898 }
5899
5900 /*
5901  * Fetch timestamp of latest processed commit/abort record.
5902  * Startup process maintains an accurate local copy in XLogReceiptTime
5903  */
5904 TimestampTz
5905 GetCurrentChunkReplayStartTime(void)
5906 {
5907         /* use volatile pointer to prevent code rearrangement */
5908         volatile XLogCtlData *xlogctl = XLogCtl;
5909         TimestampTz xtime;
5910
5911         SpinLockAcquire(&xlogctl->info_lck);
5912         xtime = xlogctl->currentChunkStartTime;
5913         SpinLockRelease(&xlogctl->info_lck);
5914
5915         return xtime;
5916 }
5917
5918 /*
5919  * Returns time of receipt of current chunk of XLOG data, as well as
5920  * whether it was received from streaming replication or from archives.
5921  */
5922 void
5923 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5924 {
5925         /*
5926          * This must be executed in the startup process, since we don't export the
5927          * relevant state to shared memory.
5928          */
5929         Assert(InRecovery);
5930
5931         *rtime = XLogReceiptTime;
5932         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5933 }
5934
/*
 * Raise an ERROR if currValue is lower than minValue.
 *
 * param_name is the name of the setting being checked; it is a parameter
 * name and does not require translation.  currValue and minValue are
 * printed with %d.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. a conditional expression) compare as a unit.  currValue
 * and minValue are each expanded more than once, so they must be free of
 * side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
5950
5951 /*
5952  * Check to see if required parameters are set high enough on this server
5953  * for various aspects of recovery operation.
5954  */
5955 static void
5956 CheckRequiredParameterValues(void)
5957 {
5958         /*
5959          * For archive recovery, the WAL must be generated with at least 'archive'
5960          * wal_level.
5961          */
5962         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5963         {
5964                 ereport(WARNING,
5965                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5966                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5967         }
5968
5969         /*
5970          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5971          * we must have at least as many backend slots as the primary.
5972          */
5973         if (InArchiveRecovery && EnableHotStandby)
5974         {
5975                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5976                         ereport(ERROR,
5977                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5978                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5979
5980                 /* We ignore autovacuum_max_workers when we make this test. */
5981                 RecoveryRequiresIntParameter("max_connections",
5982                                                                          MaxConnections,
5983                                                                          ControlFile->MaxConnections);
5984                 RecoveryRequiresIntParameter("max_prepared_xacts",
5985                                                                          max_prepared_xacts,
5986                                                                          ControlFile->max_prepared_xacts);
5987                 RecoveryRequiresIntParameter("max_locks_per_xact",
5988                                                                          max_locks_per_xact,
5989                                                                          ControlFile->max_locks_per_xact);
5990         }
5991 }
5992
5993 /*
5994  * This must be called ONCE during postmaster or standalone-backend startup
5995  */
5996 void
5997 StartupXLOG(void)
5998 {
5999         XLogCtlInsert *Insert;
6000         CheckPoint      checkPoint;
6001         bool            wasShutdown;
6002         bool            reachedStopPoint = false;
6003         bool            haveBackupLabel = false;
6004         XLogRecPtr      RecPtr,
6005                                 checkPointLoc,
6006                                 EndOfLog;
6007         uint32          endLogId;
6008         uint32          endLogSeg;
6009         XLogRecord *record;
6010         uint32          freespace;
6011         TransactionId oldestActiveXID;
6012         bool            backupEndRequired = false;
6013         bool            backupFromStandby = false;
6014         DBState         dbstate_at_startup;
6015
6016         /*
6017          * Read control file and check XLOG status looks valid.
6018          *
6019          * Note: in most control paths, *ControlFile is already valid and we need
6020          * not do ReadControlFile() here, but might as well do it to be sure.
6021          */
6022         ReadControlFile();
6023
6024         if (ControlFile->state < DB_SHUTDOWNED ||
6025                 ControlFile->state > DB_IN_PRODUCTION ||
6026                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
6027                 ereport(FATAL,
6028                                 (errmsg("control file contains invalid data")));
6029
6030         if (ControlFile->state == DB_SHUTDOWNED)
6031                 ereport(LOG,
6032                                 (errmsg("database system was shut down at %s",
6033                                                 str_time(ControlFile->time))));
6034         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6035                 ereport(LOG,
6036                                 (errmsg("database system was shut down in recovery at %s",
6037                                                 str_time(ControlFile->time))));
6038         else if (ControlFile->state == DB_SHUTDOWNING)
6039                 ereport(LOG,
6040                                 (errmsg("database system shutdown was interrupted; last known up at %s",
6041                                                 str_time(ControlFile->time))));
6042         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6043                 ereport(LOG,
6044                    (errmsg("database system was interrupted while in recovery at %s",
6045                                    str_time(ControlFile->time)),
6046                         errhint("This probably means that some data is corrupted and"
6047                                         " you will have to use the last backup for recovery.")));
6048         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6049                 ereport(LOG,
6050                                 (errmsg("database system was interrupted while in recovery at log time %s",
6051                                                 str_time(ControlFile->checkPointCopy.time)),
6052                                  errhint("If this has occurred more than once some data might be corrupted"
6053                           " and you might need to choose an earlier recovery target.")));
6054         else if (ControlFile->state == DB_IN_PRODUCTION)
6055                 ereport(LOG,
6056                           (errmsg("database system was interrupted; last known up at %s",
6057                                           str_time(ControlFile->time))));
6058
6059         /* This is just to allow attaching to startup process with a debugger */
6060 #ifdef XLOG_REPLAY_DELAY
6061         if (ControlFile->state != DB_SHUTDOWNED)
6062                 pg_usleep(60000000L);
6063 #endif
6064
6065         /*
6066          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6067          * someone has performed a copy for PITR, these directories may have been
6068          * excluded and need to be re-created.
6069          */
6070         ValidateXLOGDirectoryStructure();
6071
6072         /*
6073          * Clear out any old relcache cache files.      This is *necessary* if we do
6074          * any WAL replay, since that would probably result in the cache files
6075          * being out of sync with database reality.  In theory we could leave them
6076          * in place if the database had been cleanly shut down, but it seems
6077          * safest to just remove them always and let them be rebuilt during the
6078          * first backend startup.
6079          */
6080         RelationCacheInitFileRemove();
6081
6082         /*
6083          * Initialize on the assumption we want to recover to the same timeline
6084          * that's active according to pg_control.
6085          */
6086         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6087
6088         /*
6089          * Check for recovery control file, and if so set up state for offline
6090          * recovery
6091          */
6092         readRecoveryCommandFile();
6093
6094         /* Now we can determine the list of expected TLIs */
6095         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
6096
6097         /*
6098          * If pg_control's timeline is not in expectedTLIs, then we cannot
6099          * proceed: the backup is not part of the history of the requested
6100          * timeline.
6101          */
6102         if (!list_member_int(expectedTLIs,
6103                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
6104                 ereport(FATAL,
6105                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
6106                                                 recoveryTargetTLI,
6107                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
6108
6109         /*
6110          * Save the selected recovery target timeline ID and
6111          * archive_cleanup_command in shared memory so that other processes can
6112          * see them
6113          */
6114         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
6115         strncpy(XLogCtl->archiveCleanupCommand,
6116                         archiveCleanupCommand ? archiveCleanupCommand : "",
6117                         sizeof(XLogCtl->archiveCleanupCommand));
6118
6119         if (InArchiveRecovery)
6120         {
6121                 if (StandbyMode)
6122                         ereport(LOG,
6123                                         (errmsg("entering standby mode")));
6124                 else if (recoveryTarget == RECOVERY_TARGET_XID)
6125                         ereport(LOG,
6126                                         (errmsg("starting point-in-time recovery to XID %u",
6127                                                         recoveryTargetXid)));
6128                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6129                         ereport(LOG,
6130                                         (errmsg("starting point-in-time recovery to %s",
6131                                                         timestamptz_to_str(recoveryTargetTime))));
6132                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6133                         ereport(LOG,
6134                                         (errmsg("starting point-in-time recovery to \"%s\"",
6135                                                         recoveryTargetName)));
6136                 else
6137                         ereport(LOG,
6138                                         (errmsg("starting archive recovery")));
6139         }
6140
6141         /*
6142          * Take ownership of the wakeup latch if we're going to sleep during
6143          * recovery.
6144          */
6145         if (StandbyMode)
6146                 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6147
6148         if (read_backup_label(&checkPointLoc, &backupEndRequired,
6149                                                   &backupFromStandby))
6150         {
6151                 /*
6152                  * When a backup_label file is present, we want to roll forward from
6153                  * the checkpoint it identifies, rather than using pg_control.
6154                  */
6155                 record = ReadCheckpointRecord(checkPointLoc, 0);
6156                 if (record != NULL)
6157                 {
6158                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6159                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6160                         ereport(DEBUG1,
6161                                         (errmsg("checkpoint record is at %X/%X",
6162                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6163                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
6164
6165                         /*
6166                          * Make sure that REDO location exists. This may not be the case
6167                          * if there was a crash during an online backup, which left a
6168                          * backup_label around that references a WAL segment that's
6169                          * already been archived.
6170                          */
6171                         if (XLByteLT(checkPoint.redo, checkPointLoc))
6172                         {
6173                                 if (!ReadRecord(&(checkPoint.redo), LOG, false))
6174                                         ereport(FATAL,
6175                                                         (errmsg("could not find redo location referenced by checkpoint record"),
6176                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6177                         }
6178                 }
6179                 else
6180                 {
6181                         ereport(FATAL,
6182                                         (errmsg("could not locate required checkpoint record"),
6183                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6184                         wasShutdown = false;    /* keep compiler quiet */
6185                 }
6186                 /* set flag to delete it later */
6187                 haveBackupLabel = true;
6188         }
6189         else
6190         {
6191                 /*
6192                  * Get the last valid checkpoint record.  If the latest one according
6193                  * to pg_control is broken, try the next-to-last one.
6194                  */
6195                 checkPointLoc = ControlFile->checkPoint;
6196                 RedoStartLSN = ControlFile->checkPointCopy.redo;
6197                 record = ReadCheckpointRecord(checkPointLoc, 1);
6198                 if (record != NULL)
6199                 {
6200                         ereport(DEBUG1,
6201                                         (errmsg("checkpoint record is at %X/%X",
6202                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6203                 }
6204                 else if (StandbyMode)
6205                 {
6206                         /*
6207                          * The last valid checkpoint record required for a streaming
6208                          * recovery exists in neither standby nor the primary.
6209                          */
6210                         ereport(PANIC,
6211                                         (errmsg("could not locate a valid checkpoint record")));
6212                 }
6213                 else
6214                 {
6215                         checkPointLoc = ControlFile->prevCheckPoint;
6216                         record = ReadCheckpointRecord(checkPointLoc, 2);
6217                         if (record != NULL)
6218                         {
6219                                 ereport(LOG,
6220                                                 (errmsg("using previous checkpoint record at %X/%X",
6221                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6222                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
6223                         }
6224                         else
6225                                 ereport(PANIC,
6226                                          (errmsg("could not locate a valid checkpoint record")));
6227                 }
6228                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6229                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6230         }
6231
6232         LastRec = RecPtr = checkPointLoc;
6233
6234         ereport(DEBUG1,
6235                         (errmsg("redo record is at %X/%X; shutdown %s",
6236                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
6237                                         wasShutdown ? "TRUE" : "FALSE")));
6238         ereport(DEBUG1,
6239                         (errmsg("next transaction ID: %u/%u; next OID: %u",
6240                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
6241                                         checkPoint.nextOid)));
6242         ereport(DEBUG1,
6243                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
6244                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6245         ereport(DEBUG1,
6246                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
6247                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
6248         if (!TransactionIdIsNormal(checkPoint.nextXid))
6249                 ereport(PANIC,
6250                                 (errmsg("invalid next transaction ID")));
6251
6252         ShmemVariableCache->nextXid = checkPoint.nextXid;
6253         ShmemVariableCache->nextOid = checkPoint.nextOid;
6254         ShmemVariableCache->oidCount = 0;
6255         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6256         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6257
6258         /*
6259          * We must replay WAL entries using the same TimeLineID they were created
6260          * under, so temporarily adopt the TLI indicated by the checkpoint (see
6261          * also xlog_redo()).
6262          */
6263         ThisTimeLineID = checkPoint.ThisTimeLineID;
6264
6265         lastFullPageWrites = checkPoint.fullPageWrites;
6266
6267         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6268
6269         if (XLByteLT(RecPtr, checkPoint.redo))
6270                 ereport(PANIC,
6271                                 (errmsg("invalid redo in checkpoint record")));
6272
6273         /*
6274          * Check whether we need to force recovery from WAL.  If it appears to
6275          * have been a clean shutdown and we did not have a recovery.conf file,
6276          * then assume no recovery needed.
6277          */
6278         if (XLByteLT(checkPoint.redo, RecPtr))
6279         {
6280                 if (wasShutdown)
6281                         ereport(PANIC,
6282                                         (errmsg("invalid redo record in shutdown checkpoint")));
6283                 InRecovery = true;
6284         }
6285         else if (ControlFile->state != DB_SHUTDOWNED)
6286                 InRecovery = true;
6287         else if (InArchiveRecovery)
6288         {
6289                 /* force recovery due to presence of recovery.conf */
6290                 InRecovery = true;
6291         }
6292
6293         /* REDO */
6294         if (InRecovery)
6295         {
6296                 int                     rmid;
6297
6298                 /* use volatile pointer to prevent code rearrangement */
6299                 volatile XLogCtlData *xlogctl = XLogCtl;
6300
6301                 /*
6302                  * Update pg_control to show that we are recovering and to show the
6303                  * selected checkpoint as the place we are starting from. We also mark
6304                  * pg_control with any minimum recovery stop point obtained from a
6305                  * backup history file.
6306                  */
6307                 dbstate_at_startup = ControlFile->state;
6308                 if (InArchiveRecovery)
6309                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6310                 else
6311                 {
6312                         ereport(LOG,
6313                                         (errmsg("database system was not properly shut down; "
6314                                                         "automatic recovery in progress")));
6315                         ControlFile->state = DB_IN_CRASH_RECOVERY;
6316                 }
6317                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6318                 ControlFile->checkPoint = checkPointLoc;
6319                 ControlFile->checkPointCopy = checkPoint;
6320                 if (InArchiveRecovery)
6321                 {
6322                         /* initialize minRecoveryPoint if not set yet */
6323                         if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
6324                                 ControlFile->minRecoveryPoint = checkPoint.redo;
6325                 }
6326
6327                 /*
6328                  * Set backupStartPoint if we're starting recovery from a base backup.
6329                  *
6330                  * Set backupEndPoint and use minRecoveryPoint as the backup end
6331                  * location if we're starting recovery from a base backup which was
6332                  * taken from the standby. In this case, the database system status in
6333                  * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
6334                  * the backup is corrupted, so we cancel recovery.
6335                  */
6336                 if (haveBackupLabel)
6337                 {
6338                         ControlFile->backupStartPoint = checkPoint.redo;
6339                         ControlFile->backupEndRequired = backupEndRequired;
6340
6341                         if (backupFromStandby)
6342                         {
6343                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
6344                                         ereport(FATAL,
6345                                                         (errmsg("backup_label contains inconsistent data with control file"),
6346                                                          errhint("This means that the backup is corrupted and you will "
6347                                                            "have to use another backup for recovery.")));
6348                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6349                         }
6350                 }
6351                 ControlFile->time = (pg_time_t) time(NULL);
6352                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6353                 UpdateControlFile();
6354
6355                 /* initialize shared-memory copy of latest checkpoint XID/epoch */
6356                 XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6357                 XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
6358
6359                 /* initialize our local copy of minRecoveryPoint */
6360                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6361
6362                 /*
6363                  * Reset pgstat data, because it may be invalid after recovery.
6364                  */
6365                 pgstat_reset_all();
6366
6367                 /*
6368                  * If there was a backup label file, it's done its job and the info
6369                  * has now been propagated into pg_control.  We must get rid of the
6370                  * label file so that if we crash during recovery, we'll pick up at
6371                  * the latest recovery restartpoint instead of going all the way back
6372                  * to the backup start point.  It seems prudent though to just rename
6373                  * the file out of the way rather than delete it completely.
6374                  */
6375                 if (haveBackupLabel)
6376                 {
6377                         unlink(BACKUP_LABEL_OLD);
6378                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6379                                 ereport(FATAL,
6380                                                 (errcode_for_file_access(),
6381                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6382                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6383                 }
6384
6385                 /* Check that the GUCs used to generate the WAL allow recovery */
6386                 CheckRequiredParameterValues();
6387
6388                 /*
6389                  * We're in recovery, so unlogged relations may be trashed and must be
6390                  * reset.  This should be done BEFORE allowing Hot Standby
6391                  * connections, so that read-only backends don't try to read whatever
6392                  * garbage is left over from before.
6393                  */
6394                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6395
6396                 /*
6397                  * Likewise, delete any saved transaction snapshot files that got left
6398                  * behind by crashed backends.
6399                  */
6400                 DeleteAllExportedSnapshotFiles();
6401
6402                 /*
6403                  * Initialize for Hot Standby, if enabled. We won't let backends in
6404                  * yet, not until we've reached the min recovery point specified in
6405                  * control file and we've established a recovery snapshot from a
6406                  * running-xacts WAL record.
6407                  */
6408                 if (InArchiveRecovery && EnableHotStandby)
6409                 {
6410                         TransactionId *xids;
6411                         int                     nxids;
6412
6413                         ereport(DEBUG1,
6414                                         (errmsg("initializing for hot standby")));
6415
6416                         InitRecoveryTransactionEnvironment();
6417
6418                         if (wasShutdown)
6419                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6420                         else
6421                                 oldestActiveXID = checkPoint.oldestActiveXid;
6422                         Assert(TransactionIdIsValid(oldestActiveXID));
6423
6424                         /*
6425                          * Startup commit log and subtrans only. Other SLRUs are not
6426                          * maintained during recovery and need not be started yet.
6427                          */
6428                         StartupCLOG();
6429                         StartupSUBTRANS(oldestActiveXID);
6430
6431                         /*
6432                          * If we're beginning at a shutdown checkpoint, we know that
6433                          * nothing was running on the master at this point. So fake-up an
6434                          * empty running-xacts record and use that here and now. Recover
6435                          * additional standby state for prepared transactions.
6436                          */
6437                         if (wasShutdown)
6438                         {
6439                                 RunningTransactionsData running;
6440                                 TransactionId latestCompletedXid;
6441
6442                                 /*
6443                                  * Construct a RunningTransactions snapshot representing a
6444                                  * shut down server, with only prepared transactions still
6445                                  * alive. We're never overflowed at this point because all
6446                                  * subxids are listed with their parent prepared transactions.
6447                                  */
6448                                 running.xcnt = nxids;
6449                                 running.subxid_overflow = false;
6450                                 running.nextXid = checkPoint.nextXid;
6451                                 running.oldestRunningXid = oldestActiveXID;
6452                                 latestCompletedXid = checkPoint.nextXid;
6453                                 TransactionIdRetreat(latestCompletedXid);
6454                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6455                                 running.latestCompletedXid = latestCompletedXid;
6456                                 running.xids = xids;
6457
6458                                 ProcArrayApplyRecoveryInfo(&running);
6459
6460                                 StandbyRecoverPreparedTransactions(false);
6461                         }
6462                 }
6463
6464                 /* Initialize resource managers */
6465                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6466                 {
6467                         if (RmgrTable[rmid].rm_startup != NULL)
6468                                 RmgrTable[rmid].rm_startup();
6469                 }
6470
6471                 /*
6472                  * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
6473                  * recoveryLastXTime.
6474                  *
6475                  * This is slightly confusing if we're starting from an online
6476                  * checkpoint; we've just read and replayed the checkpoint record, but
6477                  * we're going to start replay from its redo pointer, which precedes
6478                  * the location of the checkpoint record itself. So even though the
6479                  * last record we've replayed is indeed ReadRecPtr, we haven't
6480                  * replayed all the preceding records yet. That's OK for the current
6481                  * use of these variables.
6482                  */
6483                 SpinLockAcquire(&xlogctl->info_lck);
6484                 xlogctl->replayEndRecPtr = ReadRecPtr;
6485                 xlogctl->recoveryLastRecPtr = EndRecPtr;
6486                 xlogctl->recoveryLastXTime = 0;
6487                 xlogctl->currentChunkStartTime = 0;
6488                 xlogctl->recoveryPause = false;
6489                 SpinLockRelease(&xlogctl->info_lck);
6490
6491                 /* Also ensure XLogReceiptTime has a sane value */
6492                 XLogReceiptTime = GetCurrentTimestamp();
6493
6494                 /*
6495                  * Let postmaster know we've started redo now, so that it can launch
6496                  * checkpointer to perform restartpoints.  We don't bother during
6497                  * crash recovery as restartpoints can only be performed during
6498                  * archive recovery.  And we'd like to keep crash recovery simple, to
6499                  * avoid introducing bugs that could affect you when recovering after
6500                  * crash.
6501                  *
6502                  * After this point, we can no longer assume that we're the only
6503                  * process in addition to postmaster!  Also, fsync requests are
6504                  * subsequently to be handled by the checkpointer, not locally.
6505                  */
6506                 if (InArchiveRecovery && IsUnderPostmaster)
6507                 {
6508                         PublishStartupProcessInformation();
6509                         SetForwardFsyncRequests();
6510                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6511                         bgwriterLaunched = true;
6512                 }
6513
6514                 /*
6515                  * Allow read-only connections immediately if we're consistent
6516                  * already.
6517                  */
6518                 CheckRecoveryConsistency();
6519
6520                 /*
6521                  * Find the first record that logically follows the checkpoint --- it
6522                  * might physically precede it, though.
6523                  */
6524                 if (XLByteLT(checkPoint.redo, RecPtr))
6525                 {
6526                         /* back up to find the record */
6527                         record = ReadRecord(&(checkPoint.redo), PANIC, false);
6528                 }
6529                 else
6530                 {
6531                         /* just have to read next record after CheckPoint */
6532                         record = ReadRecord(NULL, LOG, false);
6533                 }
6534
6535                 if (record != NULL)
6536                 {
6537                         bool            recoveryContinue = true;
6538                         bool            recoveryApply = true;
6539                         bool            recoveryPause = false;
6540                         ErrorContextCallback errcontext;
6541                         TimestampTz xtime;
6542
6543                         InRedo = true;
6544
6545                         ereport(LOG,
6546                                         (errmsg("redo starts at %X/%X",
6547                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6548
6549                         /*
6550                          * main redo apply loop
6551                          */
6552                         do
6553                         {
6554 #ifdef WAL_DEBUG
6555                                 if (XLOG_DEBUG ||
6556                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6557                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6558                                 {
6559                                         StringInfoData buf;
6560
6561                                         initStringInfo(&buf);
6562                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6563                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
6564                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
6565                                         xlog_outrec(&buf, record);
6566                                         appendStringInfo(&buf, " - ");
6567                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6568                                                                                                            record->xl_info,
6569                                                                                                          XLogRecGetData(record));
6570                                         elog(LOG, "%s", buf.data);
6571                                         pfree(buf.data);
6572                                 }
6573 #endif
6574
6575                                 /* Handle interrupt signals of startup process */
6576                                 HandleStartupProcInterrupts();
6577
6578                                 /* Allow read-only connections if we're consistent now */
6579                                 CheckRecoveryConsistency();
6580
6581                                 /*
6582                                  * Have we reached our recovery target?
6583                                  */
6584                                 if (recoveryStopsHere(record, &recoveryApply))
6585                                 {
6586                                         /*
6587                                          * Pause only if users can connect to send a resume
6588                                          * message
6589                                          */
6590                                         if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY)
6591                                         {
6592                                                 SetRecoveryPause(true);
6593                                                 recoveryPausesHere();
6594                                         }
6595                                         reachedStopPoint = true;        /* see below */
6596                                         recoveryContinue = false;
6597                                         if (!recoveryApply)
6598                                                 break;
6599                                 }
6600
6601                                 /* Setup error traceback support for ereport() */
6602                                 errcontext.callback = rm_redo_error_callback;
6603                                 errcontext.arg = (void *) record;
6604                                 errcontext.previous = error_context_stack;
6605                                 error_context_stack = &errcontext;
6606
6607                                 /*
6608                                  * ShmemVariableCache->nextXid must be beyond record's xid.
6609                                  *
6610                                  * We don't expect anyone else to modify nextXid, hence we
6611                                  * don't need to hold a lock while examining it.  We still
6612                                  * acquire the lock to modify it, though.
6613                                  */
6614                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6615                                                                                                  ShmemVariableCache->nextXid))
6616                                 {
6617                                         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
6618                                         ShmemVariableCache->nextXid = record->xl_xid;
6619                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6620                                         LWLockRelease(XidGenLock);
6621                                 }
6622
6623                                 /*
6624                                  * Update shared replayEndRecPtr before replaying this record,
6625                                  * so that XLogFlush will update minRecoveryPoint correctly.
6626                                  */
6627                                 SpinLockAcquire(&xlogctl->info_lck);
6628                                 xlogctl->replayEndRecPtr = EndRecPtr;
6629                                 recoveryPause = xlogctl->recoveryPause;
6630                                 SpinLockRelease(&xlogctl->info_lck);
6631
6632                                 /*
6633                                  * Pause only if users can connect to send a resume message
6634                                  */
6635                                 if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY)
6636                                         recoveryPausesHere();
6637
6638                                 /*
6639                                  * If we are attempting to enter Hot Standby mode, process
6640                                  * XIDs we see
6641                                  */
6642                                 if (standbyState >= STANDBY_INITIALIZED &&
6643                                         TransactionIdIsValid(record->xl_xid))
6644                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6645
6646                                 /* Now apply the WAL record itself */
6647                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6648
6649                                 /* Pop the error context stack */
6650                                 error_context_stack = errcontext.previous;
6651
6652                                 if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
6653                                         XLByteLE(ControlFile->backupEndPoint, EndRecPtr))
6654                                 {
6655                                         /*
6656                                          * We have reached the end of base backup, the point where
6657                                          * the minimum recovery point in pg_control indicates. The
6658                                          * data on disk is now consistent. Reset backupStartPoint
6659                                          * and backupEndPoint.
6660                                          */
6661                                         elog(DEBUG1, "end of backup reached");
6662
6663                                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6664
6665                                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
6666                                         MemSet(&ControlFile->backupEndPoint, 0, sizeof(XLogRecPtr));
6667                                         ControlFile->backupEndRequired = false;
6668                                         UpdateControlFile();
6669
6670                                         LWLockRelease(ControlFileLock);
6671                                 }
6672
6673                                 /*
6674                                  * Update shared recoveryLastRecPtr after this record has been
6675                                  * replayed.
6676                                  */
6677                                 SpinLockAcquire(&xlogctl->info_lck);
6678                                 xlogctl->recoveryLastRecPtr = EndRecPtr;
6679                                 SpinLockRelease(&xlogctl->info_lck);
6680
6681                                 LastRec = ReadRecPtr;
6682
6683                                 record = ReadRecord(NULL, LOG, false);
6684                         } while (record != NULL && recoveryContinue);
6685
6686                         /*
6687                          * end of main redo apply loop
6688                          */
6689
6690                         ereport(LOG,
6691                                         (errmsg("redo done at %X/%X",
6692                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6693                         xtime = GetLatestXTime();
6694                         if (xtime)
6695                                 ereport(LOG,
6696                                          (errmsg("last completed transaction was at log time %s",
6697                                                          timestamptz_to_str(xtime))));
6698                         InRedo = false;
6699                 }
6700                 else
6701                 {
6702                         /* there are no WAL records following the checkpoint */
6703                         ereport(LOG,
6704                                         (errmsg("redo is not required")));
6705                 }
6706         }
6707
6708         /*
6709          * Kill WAL receiver, if it's still running, before we continue to write
6710          * the startup checkpoint record. It will trump over the checkpoint and
6711          * subsequent records if it's still alive when we start writing WAL.
6712          */
6713         ShutdownWalRcv();
6714
6715         /*
6716          * We don't need the latch anymore. It's not strictly necessary to disown
6717          * it, but let's do it for the sake of tidiness.
6718          */
6719         if (StandbyMode)
6720                 DisownLatch(&XLogCtl->recoveryWakeupLatch);
6721
6722         /*
6723          * We are now done reading the xlog from stream. Turn off streaming
6724          * recovery to force fetching the files (which would be required at end of
6725          * recovery, e.g., timeline history file) from archive or pg_xlog.
6726          */
6727         StandbyMode = false;
6728
6729         /*
6730          * Re-fetch the last valid or last applied record, so we can identify the
6731          * exact endpoint of what we consider the valid portion of WAL.
6732          */
6733         record = ReadRecord(&LastRec, PANIC, false);
6734         EndOfLog = EndRecPtr;
6735         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
6736
6737         /*
6738          * Complain if we did not roll forward far enough to render the backup
6739          * dump consistent.  Note: it is indeed okay to look at the local variable
6740          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6741          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6742          * advanced beyond the WAL we processed.
6743          */
6744         if (InRecovery &&
6745                 (XLByteLT(EndOfLog, minRecoveryPoint) ||
6746                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6747         {
6748                 if (reachedStopPoint)
6749                 {
6750                         /* stopped because of stop request */
6751                         ereport(FATAL,
6752                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6753                 }
6754
6755                 /*
6756                  * Ran off end of WAL before reaching end-of-backup WAL record, or
6757                  * minRecoveryPoint. That's usually a bad sign, indicating that you
6758                  * tried to recover from an online backup but never called
6759                  * pg_stop_backup(), or you didn't archive all the WAL up to that
6760                  * point. However, this also happens in crash recovery, if the system
6761                  * crashes while an online backup is in progress. We must not treat
6762                  * that as an error, or the database will refuse to start up.
6763                  */
6764                 if (InArchiveRecovery || ControlFile->backupEndRequired)
6765                 {
6766                         if (ControlFile->backupEndRequired)
6767                                 ereport(FATAL,
6768                                                 (errmsg("WAL ends before end of online backup"),
6769                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
6770                         else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6771                                 ereport(FATAL,
6772                                                 (errmsg("WAL ends before end of online backup"),
6773                                                  errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
6774                         else
6775                                 ereport(FATAL,
6776                                           (errmsg("WAL ends before consistent recovery point")));
6777                 }
6778         }
6779
6780         /*
6781          * Consider whether we need to assign a new timeline ID.
6782          *
6783          * If we are doing an archive recovery, we always assign a new ID.      This
6784          * handles a couple of issues.  If we stopped short of the end of WAL
6785          * during recovery, then we are clearly generating a new timeline and must
6786          * assign it a unique new ID.  Even if we ran to the end, modifying the
6787          * current last segment is problematic because it may result in trying to
6788          * overwrite an already-archived copy of that segment, and we encourage
6789          * DBAs to make their archive_commands reject that.  We can dodge the
6790          * problem by making the new active segment have a new timeline ID.
6791          *
6792          * In a normal crash recovery, we can just extend the timeline we were in.
6793          */
6794         if (InArchiveRecovery)
6795         {
6796                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6797                 ereport(LOG,
6798                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6799                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6800                                                          curFileTLI, endLogId, endLogSeg);
6801         }
6802
6803         /* Save the selected TimeLineID in shared memory, too */
6804         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6805
6806         /*
6807          * We are now done reading the old WAL.  Turn off archive fetching if it
6808          * was active, and make a writable copy of the last WAL segment. (Note
6809          * that we also have a copy of the last block of the old WAL in readBuf;
6810          * we will use that below.)
6811          */
6812         if (InArchiveRecovery)
6813                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6814
6815         /*
6816          * Prepare to write WAL starting at EndOfLog position, and init xlog
6817          * buffer cache using the block containing the last record from the
6818          * previous incarnation.
6819          */
6820         openLogId = endLogId;
6821         openLogSeg = endLogSeg;
6822         openLogFile = XLogFileOpen(openLogId, openLogSeg);
6823         openLogOff = 0;
6824         Insert = &XLogCtl->Insert;
6825         Insert->PrevRecord = LastRec;
6826         XLogCtl->xlblocks[0].xlogid = openLogId;
6827         XLogCtl->xlblocks[0].xrecoff =
6828                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
6829
6830         /*
6831          * Tricky point here: readBuf contains the *last* block that the LastRec
6832          * record spans, not the one it starts in.      The last block is indeed the
6833          * one we want to use.
6834          */
6835         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
6836         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6837         Insert->currpos = (char *) Insert->currpage +
6838                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
6839
6840         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6841
6842         XLogCtl->LogwrtResult = LogwrtResult;
6843
6844         XLogCtl->LogwrtRqst.Write = EndOfLog;
6845         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6846
6847         freespace = INSERT_FREESPACE(Insert);
6848         if (freespace > 0)
6849         {
6850                 /* Make sure rest of page is zero */
6851                 MemSet(Insert->currpos, 0, freespace);
6852                 XLogCtl->Write.curridx = 0;
6853         }
6854         else
6855         {
6856                 /*
6857                  * Whenever LogwrtResult points to exactly the end of a page,
6858                  * Write.curridx must point to the *next* page (see XLogWrite()).
6859                  *
6860                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
6861                  * this is sufficient.  The first actual attempt to insert a log
6862                  * record will advance the insert state.
6863                  */
6864                 XLogCtl->Write.curridx = NextBufIdx(0);
6865         }
6866
6867         /* Pre-scan prepared transactions to find out the range of XIDs present */
6868         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6869
6870         /*
6871          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6872          * record before resource manager writes cleanup WAL records or checkpoint
6873          * record is written.
6874          */
6875         Insert->fullPageWrites = lastFullPageWrites;
6876         LocalSetXLogInsertAllowed();
6877         UpdateFullPageWrites();
6878         LocalXLogInsertAllowed = -1;
6879
6880         if (InRecovery)
6881         {
6882                 int                     rmid;
6883
6884                 /*
6885                  * Resource managers might need to write WAL records, eg, to record
6886                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
6887                  * this process only.
6888                  */
6889                 LocalSetXLogInsertAllowed();
6890
6891                 /*
6892                  * Allow resource managers to do any required cleanup.
6893                  */
6894                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6895                 {
6896                         if (RmgrTable[rmid].rm_cleanup != NULL)
6897                                 RmgrTable[rmid].rm_cleanup();
6898                 }
6899
6900                 /* Disallow XLogInsert again */
6901                 LocalXLogInsertAllowed = -1;
6902
6903                 /*
6904                  * Perform a checkpoint to update all our recovery activity to disk.
6905                  *
6906                  * Note that we write a shutdown checkpoint rather than an on-line
6907                  * one. This is not particularly critical, but since we may be
6908                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6909                  * the rule that TLI only changes in shutdown checkpoints, which
6910                  * allows some extra error checking in xlog_redo.
6911                  */
6912                 if (bgwriterLaunched)
6913                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6914                                                           CHECKPOINT_IMMEDIATE |
6915                                                           CHECKPOINT_WAIT);
6916                 else
6917                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6918
6919                 /*
6920                  * And finally, execute the recovery_end_command, if any.
6921                  */
6922                 if (recoveryEndCommand)
6923                         ExecuteRecoveryCommand(recoveryEndCommand,
6924                                                                    "recovery_end_command",
6925                                                                    true);
6926         }
6927
6928         /*
6929          * Preallocate additional log files, if wanted.
6930          */
6931         PreallocXlogFiles(EndOfLog);
6932
6933         /*
6934          * Reset initial contents of unlogged relations.  This has to be done
6935          * AFTER recovery is complete so that any unlogged relations created
6936          * during recovery also get picked up.
6937          */
6938         if (InRecovery)
6939                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6940
6941         /*
6942          * Okay, we're officially UP.
6943          */
6944         InRecovery = false;
6945
6946         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6947         ControlFile->state = DB_IN_PRODUCTION;
6948         ControlFile->time = (pg_time_t) time(NULL);
6949         UpdateControlFile();
6950         LWLockRelease(ControlFileLock);
6951
6952         /* start the archive_timeout timer running */
6953         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6954
6955         /* also initialize latestCompletedXid, to nextXid - 1 */
6956         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6957         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6958         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6959         LWLockRelease(ProcArrayLock);
6960
6961         /*
6962          * Start up the commit log and subtrans, if not already done for hot
6963          * standby.
6964          */
6965         if (standbyState == STANDBY_DISABLED)
6966         {
6967                 StartupCLOG();
6968                 StartupSUBTRANS(oldestActiveXID);
6969         }
6970
6971         /*
6972          * Perform end of recovery actions for any SLRUs that need it.
6973          */
6974         StartupMultiXact();
6975         TrimCLOG();
6976
6977         /* Reload shared-memory state for prepared transactions */
6978         RecoverPreparedTransactions();
6979
6980         /*
6981          * Shutdown the recovery environment. This must occur after
6982          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6983          */
6984         if (standbyState != STANDBY_DISABLED)
6985                 ShutdownRecoveryTransactionEnvironment();
6986
6987         /* Shut down readFile facility, free space */
6988         if (readFile >= 0)
6989         {
6990                 close(readFile);
6991                 readFile = -1;
6992         }
6993         if (readBuf)
6994         {
6995                 free(readBuf);
6996                 readBuf = NULL;
6997         }
6998         if (readRecordBuf)
6999         {
7000                 free(readRecordBuf);
7001                 readRecordBuf = NULL;
7002                 readRecordBufSize = 0;
7003         }
7004
7005         /*
7006          * If any of the critical GUCs have changed, log them before we allow
7007          * backends to write WAL.
7008          */
7009         LocalSetXLogInsertAllowed();
7010         XLogReportParameters();
7011
7012         /*
7013          * All done.  Allow backends to write WAL.      (Although the bool flag is
7014          * probably atomic in itself, we use the info_lck here to ensure that
7015          * there are no race conditions concerning visibility of other recent
7016          * updates to shared memory.)
7017          */
7018         {
7019                 /* use volatile pointer to prevent code rearrangement */
7020                 volatile XLogCtlData *xlogctl = XLogCtl;
7021
7022                 SpinLockAcquire(&xlogctl->info_lck);
7023                 xlogctl->SharedRecoveryInProgress = false;
7024                 SpinLockRelease(&xlogctl->info_lck);
7025         }
7026 }
7027
7028 /*
7029  * Checks if recovery has reached a consistent state. When consistency is
7030  * reached and we have a valid starting standby snapshot, tell postmaster
7031  * that it can start accepting read-only connections.
7032  */
7033 static void
7034 CheckRecoveryConsistency(void)
7035 {
7036         /*
7037          * During crash recovery, we don't reach a consistent state until we've
7038          * replayed all the WAL.
7039          */
7040         if (XLogRecPtrIsInvalid(minRecoveryPoint))
7041                 return;
7042
7043         /*
7044          * Have we passed our safe starting point?
7045          */
7046         if (!reachedConsistency &&
7047                 XLByteLE(minRecoveryPoint, EndRecPtr) &&
7048                 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7049         {
7050                 /*
7051                  * Check to see if the XLOG sequence contained any unresolved
7052                  * references to uninitialized pages.
7053                  */
7054                 XLogCheckInvalidPages();
7055
7056                 reachedConsistency = true;
7057                 ereport(LOG,
7058                                 (errmsg("consistent recovery state reached at %X/%X",
7059                                                 EndRecPtr.xlogid, EndRecPtr.xrecoff)));
7060         }
7061
7062         /*
7063          * Have we got a valid starting snapshot that will allow queries to be
7064          * run? If so, we can tell postmaster that the database is consistent now,
7065          * enabling connections.
7066          */
7067         if (standbyState == STANDBY_SNAPSHOT_READY &&
7068                 !LocalHotStandbyActive &&
7069                 reachedConsistency &&
7070                 IsUnderPostmaster)
7071         {
7072                 /* use volatile pointer to prevent code rearrangement */
7073                 volatile XLogCtlData *xlogctl = XLogCtl;
7074
7075                 SpinLockAcquire(&xlogctl->info_lck);
7076                 xlogctl->SharedHotStandbyActive = true;
7077                 SpinLockRelease(&xlogctl->info_lck);
7078
7079                 LocalHotStandbyActive = true;
7080
7081                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
7082         }
7083 }
7084
7085 /*
7086  * Is the system still in recovery?
7087  *
7088  * Unlike testing InRecovery, this works in any process that's connected to
7089  * shared memory.
7090  *
7091  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
7092  * variables the first time we see that recovery is finished.
7093  */
7094 bool
7095 RecoveryInProgress(void)
7096 {
7097         /*
7098          * We check shared state each time only until we leave recovery mode. We
7099          * can't re-enter recovery, so there's no need to keep checking after the
7100          * shared variable has once been seen false.
7101          */
7102         if (!LocalRecoveryInProgress)
7103                 return false;
7104         else
7105         {
7106                 /* use volatile pointer to prevent code rearrangement */
7107                 volatile XLogCtlData *xlogctl = XLogCtl;
7108
7109                 /* spinlock is essential on machines with weak memory ordering! */
7110                 SpinLockAcquire(&xlogctl->info_lck);
7111                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
7112                 SpinLockRelease(&xlogctl->info_lck);
7113
7114                 /*
7115                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
7116                  * is finished. InitPostgres() relies upon this behaviour to ensure
7117                  * that InitXLOGAccess() is called at backend startup.  (If you change
7118                  * this, see also LocalSetXLogInsertAllowed.)
7119                  */
7120                 if (!LocalRecoveryInProgress)
7121                         InitXLOGAccess();
7122
7123                 return LocalRecoveryInProgress;
7124         }
7125 }
7126
7127 /*
7128  * Is HotStandby active yet? This is only important in special backends
7129  * since normal backends won't ever be able to connect until this returns
7130  * true. Postmaster knows this by way of signal, not via shared memory.
7131  *
7132  * Unlike testing standbyState, this works in any process that's connected to
7133  * shared memory.
7134  */
7135 bool
7136 HotStandbyActive(void)
7137 {
7138         /*
7139          * We check shared state each time only until Hot Standby is active. We
7140          * can't de-activate Hot Standby, so there's no need to keep checking
7141          * after the shared variable has once been seen true.
7142          */
7143         if (LocalHotStandbyActive)
7144                 return true;
7145         else
7146         {
7147                 /* use volatile pointer to prevent code rearrangement */
7148                 volatile XLogCtlData *xlogctl = XLogCtl;
7149
7150                 /* spinlock is essential on machines with weak memory ordering! */
7151                 SpinLockAcquire(&xlogctl->info_lck);
7152                 LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
7153                 SpinLockRelease(&xlogctl->info_lck);
7154
7155                 return LocalHotStandbyActive;
7156         }
7157 }
7158
7159 /*
7160  * Is this process allowed to insert new WAL records?
7161  *
7162  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7163  * But we also have provisions for forcing the result "true" or "false"
7164  * within specific processes regardless of the global state.
7165  */
7166 bool
7167 XLogInsertAllowed(void)
7168 {
7169         /*
7170          * If value is "unconditionally true" or "unconditionally false", just
7171          * return it.  This provides the normal fast path once recovery is known
7172          * done.
7173          */
7174         if (LocalXLogInsertAllowed >= 0)
7175                 return (bool) LocalXLogInsertAllowed;
7176
7177         /*
7178          * Else, must check to see if we're still in recovery.
7179          */
7180         if (RecoveryInProgress())
7181                 return false;
7182
7183         /*
7184          * On exit from recovery, reset to "unconditionally true", since there is
7185          * no need to keep checking.
7186          */
7187         LocalXLogInsertAllowed = 1;
7188         return true;
7189 }
7190
7191 /*
7192  * Make XLogInsertAllowed() return true in the current process only.
7193  *
7194  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
7195  * and even call LocalSetXLogInsertAllowed() again after that.
7196  */
7197 static void
7198 LocalSetXLogInsertAllowed(void)
7199 {
7200         Assert(LocalXLogInsertAllowed == -1);
7201         LocalXLogInsertAllowed = 1;
7202
7203         /* Initialize as RecoveryInProgress() would do when switching state */
7204         InitXLOGAccess();
7205 }
7206
7207 /*
7208  * Subroutine to try to fetch and validate a prior checkpoint record.
7209  *
7210  * whichChkpt identifies the checkpoint (merely for reporting purposes).
7211  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7212  */
7213 static XLogRecord *
7214 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
7215 {
7216         XLogRecord *record;
7217
7218         if (!XRecOffIsValid(RecPtr.xrecoff))
7219         {
7220                 switch (whichChkpt)
7221                 {
7222                         case 1:
7223                                 ereport(LOG,
7224                                 (errmsg("invalid primary checkpoint link in control file")));
7225                                 break;
7226                         case 2:
7227                                 ereport(LOG,
7228                                                 (errmsg("invalid secondary checkpoint link in control file")));
7229                                 break;
7230                         default:
7231                                 ereport(LOG,
7232                                    (errmsg("invalid checkpoint link in backup_label file")));
7233                                 break;
7234                 }
7235                 return NULL;
7236         }
7237
7238         record = ReadRecord(&RecPtr, LOG, true);
7239
7240         if (record == NULL)
7241         {
7242                 switch (whichChkpt)
7243                 {
7244                         case 1:
7245                                 ereport(LOG,
7246                                                 (errmsg("invalid primary checkpoint record")));
7247                                 break;
7248                         case 2:
7249                                 ereport(LOG,
7250                                                 (errmsg("invalid secondary checkpoint record")));
7251                                 break;
7252                         default:
7253                                 ereport(LOG,
7254                                                 (errmsg("invalid checkpoint record")));
7255                                 break;
7256                 }
7257                 return NULL;
7258         }
7259         if (record->xl_rmid != RM_XLOG_ID)
7260         {
7261                 switch (whichChkpt)
7262                 {
7263                         case 1:
7264                                 ereport(LOG,
7265                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
7266                                 break;
7267                         case 2:
7268                                 ereport(LOG,
7269                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
7270                                 break;
7271                         default:
7272                                 ereport(LOG,
7273                                 (errmsg("invalid resource manager ID in checkpoint record")));
7274                                 break;
7275                 }
7276                 return NULL;
7277         }
7278         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
7279                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
7280         {
7281                 switch (whichChkpt)
7282                 {
7283                         case 1:
7284                                 ereport(LOG,
7285                                    (errmsg("invalid xl_info in primary checkpoint record")));
7286                                 break;
7287                         case 2:
7288                                 ereport(LOG,
7289                                  (errmsg("invalid xl_info in secondary checkpoint record")));
7290                                 break;
7291                         default:
7292                                 ereport(LOG,
7293                                                 (errmsg("invalid xl_info in checkpoint record")));
7294                                 break;
7295                 }
7296                 return NULL;
7297         }
7298         if (record->xl_len != sizeof(CheckPoint) ||
7299                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
7300         {
7301                 switch (whichChkpt)
7302                 {
7303                         case 1:
7304                                 ereport(LOG,
7305                                         (errmsg("invalid length of primary checkpoint record")));
7306                                 break;
7307                         case 2:
7308                                 ereport(LOG,
7309                                   (errmsg("invalid length of secondary checkpoint record")));
7310                                 break;
7311                         default:
7312                                 ereport(LOG,
7313                                                 (errmsg("invalid length of checkpoint record")));
7314                                 break;
7315                 }
7316                 return NULL;
7317         }
7318         return record;
7319 }
7320
7321 /*
7322  * This must be called during startup of a backend process, except that
7323  * it need not be called in a standalone backend (which does StartupXLOG
7324  * instead).  We need to initialize the local copies of ThisTimeLineID and
7325  * RedoRecPtr.
7326  *
7327  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7328  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7329  * unnecessary however, since the postmaster itself never touches XLOG anyway.
7330  */
7331 void
7332 InitXLOGAccess(void)
7333 {
7334         /* ThisTimeLineID doesn't change so we need no lock to copy it */
7335         ThisTimeLineID = XLogCtl->ThisTimeLineID;
7336         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7337
7338         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
7339         (void) GetRedoRecPtr();
7340 }
7341
7342 /*
7343  * Once spawned, a backend may update its local RedoRecPtr from
7344  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
7345  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
7346  */
7347 XLogRecPtr
7348 GetRedoRecPtr(void)
7349 {
7350         /* use volatile pointer to prevent code rearrangement */
7351         volatile XLogCtlData *xlogctl = XLogCtl;
7352
7353         SpinLockAcquire(&xlogctl->info_lck);
7354         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
7355         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
7356         SpinLockRelease(&xlogctl->info_lck);
7357
7358         return RedoRecPtr;
7359 }
7360
7361 /*
7362  * GetInsertRecPtr -- Returns the current insert position.
7363  *
7364  * NOTE: The value *actually* returned is the position of the last full
7365  * xlog page. It lags behind the real insert position by at most 1 page.
7366  * For that, we don't need to acquire WALInsertLock which can be quite
7367  * heavily contended, and an approximation is enough for the current
7368  * usage of this function.
7369  */
7370 XLogRecPtr
7371 GetInsertRecPtr(void)
7372 {
7373         /* use volatile pointer to prevent code rearrangement */
7374         volatile XLogCtlData *xlogctl = XLogCtl;
7375         XLogRecPtr      recptr;
7376
7377         SpinLockAcquire(&xlogctl->info_lck);
7378         recptr = xlogctl->LogwrtRqst.Write;
7379         SpinLockRelease(&xlogctl->info_lck);
7380
7381         return recptr;
7382 }
7383
7384 /*
7385  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
7386  * position known to be fsync'd to disk.
7387  */
7388 XLogRecPtr
7389 GetFlushRecPtr(void)
7390 {
7391         /* use volatile pointer to prevent code rearrangement */
7392         volatile XLogCtlData *xlogctl = XLogCtl;
7393         XLogRecPtr      recptr;
7394
7395         SpinLockAcquire(&xlogctl->info_lck);
7396         recptr = xlogctl->LogwrtResult.Flush;
7397         SpinLockRelease(&xlogctl->info_lck);
7398
7399         return recptr;
7400 }
7401
7402 /*
7403  * Get the time of the last xlog segment switch
7404  */
7405 pg_time_t
7406 GetLastSegSwitchTime(void)
7407 {
7408         pg_time_t       result;
7409
7410         /* Need WALWriteLock, but shared lock is sufficient */
7411         LWLockAcquire(WALWriteLock, LW_SHARED);
7412         result = XLogCtl->Write.lastSegSwitchTime;
7413         LWLockRelease(WALWriteLock);
7414
7415         return result;
7416 }
7417
7418 /*
7419  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
7420  *
7421  * This is exported for use by code that would like to have 64-bit XIDs.
7422  * We don't really support such things, but all XIDs within the system
7423  * can be presumed "close to" the result, and thus the epoch associated
7424  * with them can be determined.
7425  */
7426 void
7427 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
7428 {
7429         uint32          ckptXidEpoch;
7430         TransactionId ckptXid;
7431         TransactionId nextXid;
7432
7433         /* Must read checkpoint info first, else have race condition */
7434         {
7435                 /* use volatile pointer to prevent code rearrangement */
7436                 volatile XLogCtlData *xlogctl = XLogCtl;
7437
7438                 SpinLockAcquire(&xlogctl->info_lck);
7439                 ckptXidEpoch = xlogctl->ckptXidEpoch;
7440                 ckptXid = xlogctl->ckptXid;
7441                 SpinLockRelease(&xlogctl->info_lck);
7442         }
7443
7444         /* Now fetch current nextXid */
7445         nextXid = ReadNewTransactionId();
7446
7447         /*
7448          * nextXid is certainly logically later than ckptXid.  So if it's
7449          * numerically less, it must have wrapped into the next epoch.
7450          */
7451         if (nextXid < ckptXid)
7452                 ckptXidEpoch++;
7453
7454         *xid = nextXid;
7455         *epoch = ckptXidEpoch;
7456 }
7457
7458 /*
7459  * GetRecoveryTargetTLI - get the recovery target timeline ID
7460  */
7461 TimeLineID
7462 GetRecoveryTargetTLI(void)
7463 {
7464         /* RecoveryTargetTLI doesn't change so we need no lock to copy it */
7465         return XLogCtl->RecoveryTargetTLI;
7466 }
7467
7468 /*
7469  * This must be called ONCE during postmaster or standalone-backend shutdown
7470  */
7471 void
7472 ShutdownXLOG(int code, Datum arg)
7473 {
7474         ereport(LOG,
7475                         (errmsg("shutting down")));
7476
7477         if (RecoveryInProgress())
7478                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7479         else
7480         {
7481                 /*
7482                  * If archiving is enabled, rotate the last XLOG file so that all the
7483                  * remaining records are archived (postmaster wakes up the archiver
7484                  * process one more time at the end of shutdown). The checkpoint
7485                  * record will go to the next XLOG file and won't be archived (yet).
7486                  */
7487                 if (XLogArchivingActive() && XLogArchiveCommandSet())
7488                         RequestXLogSwitch();
7489
7490                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7491         }
7492         ShutdownCLOG();
7493         ShutdownSUBTRANS();
7494         ShutdownMultiXact();
7495
7496         ereport(LOG,
7497                         (errmsg("database system is shut down")));
7498 }
7499
7500 /*
7501  * Log start of a checkpoint.
7502  */
7503 static void
7504 LogCheckpointStart(int flags, bool restartpoint)
7505 {
7506         const char *msg;
7507
7508         /*
7509          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7510          * the main message, but what about all the flags?
7511          */
7512         if (restartpoint)
7513                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7514         else
7515                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7516
7517         elog(LOG, msg,
7518                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7519                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7520                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7521                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7522                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7523                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7524                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7525 }
7526
7527 /*
7528  * Log end of a checkpoint.
7529  */
7530 static void
7531 LogCheckpointEnd(bool restartpoint)
7532 {
7533         long            write_secs,
7534                                 sync_secs,
7535                                 total_secs,
7536                                 longest_secs,
7537                                 average_secs;
7538         int                     write_usecs,
7539                                 sync_usecs,
7540                                 total_usecs,
7541                                 longest_usecs,
7542                                 average_usecs;
7543         uint64          average_sync_time;
7544
7545         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7546
7547         TimestampDifference(CheckpointStats.ckpt_write_t,
7548                                                 CheckpointStats.ckpt_sync_t,
7549                                                 &write_secs, &write_usecs);
7550
7551         TimestampDifference(CheckpointStats.ckpt_sync_t,
7552                                                 CheckpointStats.ckpt_sync_end_t,
7553                                                 &sync_secs, &sync_usecs);
7554
7555         /* Accumulate checkpoint timing summary data, in milliseconds. */
7556         BgWriterStats.m_checkpoint_write_time +=
7557                 write_secs * 1000 + write_usecs / 1000;
7558         BgWriterStats.m_checkpoint_sync_time +=
7559                 sync_secs * 1000 + sync_usecs / 1000;
7560
7561         /*
7562          * All of the published timing statistics are accounted for.  Only
7563          * continue if a log message is to be written.
7564          */
7565         if (!log_checkpoints)
7566                 return;
7567
7568         TimestampDifference(CheckpointStats.ckpt_start_t,
7569                                                 CheckpointStats.ckpt_end_t,
7570                                                 &total_secs, &total_usecs);
7571
7572         /*
7573          * Timing values returned from CheckpointStats are in microseconds.
7574          * Convert to the second plus microsecond form that TimestampDifference
7575          * returns for homogeneous printing.
7576          */
7577         longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
7578         longest_usecs = CheckpointStats.ckpt_longest_sync -
7579                 (uint64) longest_secs *1000000;
7580
7581         average_sync_time = 0;
7582         if (CheckpointStats.ckpt_sync_rels > 0)
7583                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
7584                         CheckpointStats.ckpt_sync_rels;
7585         average_secs = (long) (average_sync_time / 1000000);
7586         average_usecs = average_sync_time - (uint64) average_secs *1000000;
7587
7588         if (restartpoint)
7589                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7590                          "%d transaction log file(s) added, %d removed, %d recycled; "
7591                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7592                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7593                          CheckpointStats.ckpt_bufs_written,
7594                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7595                          CheckpointStats.ckpt_segs_added,
7596                          CheckpointStats.ckpt_segs_removed,
7597                          CheckpointStats.ckpt_segs_recycled,
7598                          write_secs, write_usecs / 1000,
7599                          sync_secs, sync_usecs / 1000,
7600                          total_secs, total_usecs / 1000,
7601                          CheckpointStats.ckpt_sync_rels,
7602                          longest_secs, longest_usecs / 1000,
7603                          average_secs, average_usecs / 1000);
7604         else
7605                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7606                          "%d transaction log file(s) added, %d removed, %d recycled; "
7607                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
7608                          "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7609                          CheckpointStats.ckpt_bufs_written,
7610                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7611                          CheckpointStats.ckpt_segs_added,
7612                          CheckpointStats.ckpt_segs_removed,
7613                          CheckpointStats.ckpt_segs_recycled,
7614                          write_secs, write_usecs / 1000,
7615                          sync_secs, sync_usecs / 1000,
7616                          total_secs, total_usecs / 1000,
7617                          CheckpointStats.ckpt_sync_rels,
7618                          longest_secs, longest_usecs / 1000,
7619                          average_secs, average_usecs / 1000);
7620 }
7621
7622 /*
7623  * Perform a checkpoint --- either during shutdown, or on-the-fly
7624  *
7625  * flags is a bitwise OR of the following:
7626  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7627  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7628  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7629  *              ignoring checkpoint_completion_target parameter.
7630  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7631  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7632  *              CHECKPOINT_END_OF_RECOVERY).
7633  *
7634  * Note: flags contains other bits, of interest here only for logging purposes.
7635  * In particular note that this routine is synchronous and does not pay
7636  * attention to CHECKPOINT_WAIT.
7637  */
7638 void
7639 CreateCheckPoint(int flags)
7640 {
7641         bool            shutdown;
7642         CheckPoint      checkPoint;
7643         XLogRecPtr      recptr;
7644         XLogCtlInsert *Insert = &XLogCtl->Insert;
7645         XLogRecData rdata;
7646         uint32          freespace;
7647         uint32          _logId;
7648         uint32          _logSeg;
7649         uint32          redo_logId;
7650         uint32          redo_logSeg;
7651         uint32          insert_logId;
7652         uint32          insert_logSeg;
7653         TransactionId *inCommitXids;
7654         int                     nInCommit;
7655
7656         /*
7657          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7658          * issued at a different time.
7659          */
7660         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7661                 shutdown = true;
7662         else
7663                 shutdown = false;
7664
7665         /* sanity check */
7666         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7667                 elog(ERROR, "can't create a checkpoint during recovery");
7668
7669         /*
7670          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
7671          * (This is just pro forma, since in the present system structure there is
7672          * only one process that is allowed to issue checkpoints at any given
7673          * time.)
7674          */
7675         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7676
7677         /*
7678          * Prepare to accumulate statistics.
7679          *
7680          * Note: because it is possible for log_checkpoints to change while a
7681          * checkpoint proceeds, we always accumulate stats, even if
7682          * log_checkpoints is currently off.
7683          */
7684         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7685         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7686
7687         /*
7688          * Use a critical section to force system panic if we have trouble.
7689          */
7690         START_CRIT_SECTION();
7691
7692         if (shutdown)
7693         {
7694                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7695                 ControlFile->state = DB_SHUTDOWNING;
7696                 ControlFile->time = (pg_time_t) time(NULL);
7697                 UpdateControlFile();
7698                 LWLockRelease(ControlFileLock);
7699         }
7700
7701         /*
7702          * Let smgr prepare for checkpoint; this has to happen before we determine
7703          * the REDO pointer.  Note that smgr must not do anything that'd have to
7704          * be undone if we decide no checkpoint is needed.
7705          */
7706         smgrpreckpt();
7707
7708         /* Begin filling in the checkpoint WAL record */
7709         MemSet(&checkPoint, 0, sizeof(checkPoint));
7710         checkPoint.time = (pg_time_t) time(NULL);
7711
7712         /*
7713          * For Hot Standby, derive the oldestActiveXid before we fix the redo
7714          * pointer. This allows us to begin accumulating changes to assemble our
7715          * starting snapshot of locks and transactions.
7716          */
7717         if (!shutdown && XLogStandbyInfoActive())
7718                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7719         else
7720                 checkPoint.oldestActiveXid = InvalidTransactionId;
7721
7722         /*
7723          * We must hold WALInsertLock while examining insert state to determine
7724          * the checkpoint REDO pointer.
7725          */
7726         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7727
7728         /*
7729          * If this isn't a shutdown or forced checkpoint, and we have not switched
7730          * to the next WAL file since the start of the last checkpoint, skip the
7731          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
7732          * when the system is idle. That wastes log space, and more importantly it
7733          * exposes us to possible loss of both current and previous checkpoint
7734          * records if the machine crashes just as we're writing the update.
7735          * (Perhaps it'd make even more sense to checkpoint only when the previous
7736          * checkpoint record is in a different xlog page?)
7737          *
7738          * While holding the WALInsertLock we find the current WAL insertion point
7739          * and compare that with the starting point of the last checkpoint, which
7740          * is the redo pointer. We use the redo pointer because the start and end
7741          * points of a checkpoint can be hundreds of files apart on large systems
7742          * when checkpoint writes are spread out over time.
7743          */
7744         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7745                                   CHECKPOINT_FORCE)) == 0)
7746         {
7747                 XLogRecPtr      curInsert;
7748
7749                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
7750                 XLByteToSeg(curInsert, insert_logId, insert_logSeg);
7751                 XLByteToSeg(ControlFile->checkPointCopy.redo, redo_logId, redo_logSeg);
7752                 if (insert_logId == redo_logId &&
7753                         insert_logSeg == redo_logSeg)
7754                 {
7755                         LWLockRelease(WALInsertLock);
7756                         LWLockRelease(CheckpointLock);
7757                         END_CRIT_SECTION();
7758                         return;
7759                 }
7760         }
7761
7762         /*
7763          * An end-of-recovery checkpoint is created before anyone is allowed to
7764          * write WAL. To allow us to write the checkpoint record, temporarily
7765          * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
7766          * initialized, which we need here and in AdvanceXLInsertBuffer.)
7767          */
7768         if (flags & CHECKPOINT_END_OF_RECOVERY)
7769                 LocalSetXLogInsertAllowed();
7770
7771         checkPoint.ThisTimeLineID = ThisTimeLineID;
7772         checkPoint.fullPageWrites = Insert->fullPageWrites;
7773
7774         /*
7775          * Compute new REDO record ptr = location of next XLOG record.
7776          *
7777          * NB: this is NOT necessarily where the checkpoint record itself will be,
7778          * since other backends may insert more XLOG records while we're off doing
7779          * the buffer flush work.  Those XLOG records are logically after the
7780          * checkpoint, even though physically before it.  Got that?
7781          */
7782         freespace = INSERT_FREESPACE(Insert);
7783         if (freespace < SizeOfXLogRecord)
7784         {
7785                 (void) AdvanceXLInsertBuffer(false);
7786                 /* OK to ignore update return flag, since we will do flush anyway */
7787                 freespace = INSERT_FREESPACE(Insert);
7788         }
7789         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
7790
7791         /*
7792          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
7793          * must be done while holding the insert lock AND the info_lck.
7794          *
7795          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
7796          * pointing past where it really needs to point.  This is okay; the only
7797          * consequence is that XLogInsert might back up whole buffers that it
7798          * didn't really need to.  We can't postpone advancing RedoRecPtr because
7799          * XLogInserts that happen while we are dumping buffers must assume that
7800          * their buffer changes are not included in the checkpoint.
7801          */
7802         {
7803                 /* use volatile pointer to prevent code rearrangement */
7804                 volatile XLogCtlData *xlogctl = XLogCtl;
7805
7806                 SpinLockAcquire(&xlogctl->info_lck);
7807                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
7808                 SpinLockRelease(&xlogctl->info_lck);
7809         }
7810
7811         /*
7812          * Now we can release WAL insert lock, allowing other xacts to proceed
7813          * while we are flushing disk buffers.
7814          */
7815         LWLockRelease(WALInsertLock);
7816
7817         /*
7818          * If enabled, log checkpoint start.  We postpone this until now so as not
7819          * to log anything if we decided to skip the checkpoint.
7820          */
7821         if (log_checkpoints)
7822                 LogCheckpointStart(flags, false);
7823
7824         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7825
7826         /*
7827          * Before flushing data, we must wait for any transactions that are
7828          * currently in their commit critical sections.  If an xact inserted its
7829          * commit record into XLOG just before the REDO point, then a crash
7830          * restart from the REDO point would not replay that record, which means
7831          * that our flushing had better include the xact's update of pg_clog.  So
7832          * we wait till he's out of his commit critical section before proceeding.
7833          * See notes in RecordTransactionCommit().
7834          *
7835          * Because we've already released WALInsertLock, this test is a bit fuzzy:
7836          * it is possible that we will wait for xacts we didn't really need to
7837          * wait for.  But the delay should be short and it seems better to make
7838          * checkpoint take a bit longer than to hold locks longer than necessary.
7839          * (In fact, the whole reason we have this issue is that xact.c does
7840          * commit record XLOG insertion and clog update as two separate steps
7841          * protected by different locks, but again that seems best on grounds of
7842          * minimizing lock contention.)
7843          *
7844          * A transaction that has not yet set inCommit when we look cannot be at
7845          * risk, since he's not inserted his commit record yet; and one that's
7846          * already cleared it is not at risk either, since he's done fixing clog
7847          * and we will correctly flush the update below.  So we cannot miss any
7848          * xacts we need to wait for.
7849          */
7850         nInCommit = GetTransactionsInCommit(&inCommitXids);
7851         if (nInCommit > 0)
7852         {
7853                 do
7854                 {
7855                         pg_usleep(10000L);      /* wait for 10 msec */
7856                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
7857         }
7858         pfree(inCommitXids);
7859
7860         /*
7861          * Get the other info we need for the checkpoint record.
7862          */
7863         LWLockAcquire(XidGenLock, LW_SHARED);
7864         checkPoint.nextXid = ShmemVariableCache->nextXid;
7865         checkPoint.oldestXid = ShmemVariableCache->oldestXid;
7866         checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7867         LWLockRelease(XidGenLock);
7868
7869         /* Increase XID epoch if we've wrapped around since last checkpoint */
7870         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
7871         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
7872                 checkPoint.nextXidEpoch++;
7873
7874         LWLockAcquire(OidGenLock, LW_SHARED);
7875         checkPoint.nextOid = ShmemVariableCache->nextOid;
7876         if (!shutdown)
7877                 checkPoint.nextOid += ShmemVariableCache->oidCount;
7878         LWLockRelease(OidGenLock);
7879
7880         MultiXactGetCheckptMulti(shutdown,
7881                                                          &checkPoint.nextMulti,
7882                                                          &checkPoint.nextMultiOffset);
7883
7884         /*
7885          * Having constructed the checkpoint record, ensure all shmem disk buffers
7886          * and commit-log buffers are flushed to disk.
7887          *
7888          * This I/O could fail for various reasons.  If so, we will fail to
7889          * complete the checkpoint, but there is no reason to force a system
7890          * panic. Accordingly, exit critical section while doing it.
7891          */
7892         END_CRIT_SECTION();
7893
7894         CheckPointGuts(checkPoint.redo, flags);
7895
7896         /*
7897          * Take a snapshot of running transactions and write this to WAL. This
7898          * allows us to reconstruct the state of running transactions during
7899          * archive recovery, if required. Skip, if this info disabled.
7900          *
7901          * If we are shutting down, or Startup process is completing crash
7902          * recovery we don't need to write running xact data.
7903          *
7904          * Update checkPoint.nextXid since we have a later value
7905          */
7906         if (!shutdown && XLogStandbyInfoActive())
7907                 LogStandbySnapshot(&checkPoint.nextXid);
7908
7909         START_CRIT_SECTION();
7910
7911         /*
7912          * Now insert the checkpoint record into XLOG.
7913          */
7914         rdata.data = (char *) (&checkPoint);
7915         rdata.len = sizeof(checkPoint);
7916         rdata.buffer = InvalidBuffer;
7917         rdata.next = NULL;
7918
7919         recptr = XLogInsert(RM_XLOG_ID,
7920                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7921                                                 XLOG_CHECKPOINT_ONLINE,
7922                                                 &rdata);
7923
7924         XLogFlush(recptr);
7925
7926         /*
7927          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7928          * overwritten at next startup.  No-one should even try, this just allows
7929          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7930          * to just temporarily disable writing until the system has exited
7931          * recovery.
7932          */
7933         if (shutdown)
7934         {
7935                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7936                         LocalXLogInsertAllowed = -1;            /* return to "check" state */
7937                 else
7938                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7939         }
7940
7941         /*
7942          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7943          * = end of actual checkpoint record.
7944          */
7945         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
7946                 ereport(PANIC,
7947                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
7948
7949         /*
7950          * Select point at which we can truncate the log, which we base on the
7951          * prior checkpoint's earliest info.
7952          */
7953         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
7954
7955         /*
7956          * Update the control file.
7957          */
7958         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7959         if (shutdown)
7960                 ControlFile->state = DB_SHUTDOWNED;
7961         ControlFile->prevCheckPoint = ControlFile->checkPoint;
7962         ControlFile->checkPoint = ProcLastRecPtr;
7963         ControlFile->checkPointCopy = checkPoint;
7964         ControlFile->time = (pg_time_t) time(NULL);
7965         /* crash recovery should always recover to the end of WAL */
7966         MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
7967         UpdateControlFile();
7968         LWLockRelease(ControlFileLock);
7969
7970         /* Update shared-memory copy of checkpoint XID/epoch */
7971         {
7972                 /* use volatile pointer to prevent code rearrangement */
7973                 volatile XLogCtlData *xlogctl = XLogCtl;
7974
7975                 SpinLockAcquire(&xlogctl->info_lck);
7976                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
7977                 xlogctl->ckptXid = checkPoint.nextXid;
7978                 SpinLockRelease(&xlogctl->info_lck);
7979         }
7980
7981         /*
7982          * We are now done with critical updates; no need for system panic if we
7983          * have trouble while fooling with old log segments.
7984          */
7985         END_CRIT_SECTION();
7986
7987         /*
7988          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7989          */
7990         smgrpostckpt();
7991
7992         /*
7993          * Delete old log files (those no longer needed even for previous
7994          * checkpoint or the standbys in XLOG streaming).
7995          */
7996         if (_logId || _logSeg)
7997         {
7998                 KeepLogSeg(recptr, &_logId, &_logSeg);
7999                 PrevLogSeg(_logId, _logSeg);
8000                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
8001         }
8002
8003         /*
8004          * Make more log segments if needed.  (Do this after recycling old log
8005          * segments, since that may supply some of the needed files.)
8006          */
8007         if (!shutdown)
8008                 PreallocXlogFiles(recptr);
8009
8010         /*
8011          * Truncate pg_subtrans if possible.  We can throw away all data before
8012          * the oldest XMIN of any running transaction.  No future transaction will
8013          * attempt to reference any pg_subtrans entry older than that (see Asserts
8014          * in subtrans.c).      During recovery, though, we mustn't do this because
8015          * StartupSUBTRANS hasn't been called yet.
8016          */
8017         if (!RecoveryInProgress())
8018                 TruncateSUBTRANS(GetOldestXmin(true, false));
8019
8020         /* Real work is done, but log and update stats before releasing lock. */
8021         LogCheckpointEnd(false);
8022
8023         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
8024                                                                          NBuffers,
8025                                                                          CheckpointStats.ckpt_segs_added,
8026                                                                          CheckpointStats.ckpt_segs_removed,
8027                                                                          CheckpointStats.ckpt_segs_recycled);
8028
8029         LWLockRelease(CheckpointLock);
8030 }
8031
8032 /*
8033  * Flush all data in shared memory to disk, and fsync
8034  *
8035  * This is the common code shared between regular checkpoints and
8036  * recovery restartpoints.
8037  */
8038 static void
8039 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
8040 {
8041         CheckPointCLOG();
8042         CheckPointSUBTRANS();
8043         CheckPointMultiXact();
8044         CheckPointPredicate();
8045         CheckPointRelationMap();
8046         CheckPointBuffers(flags);       /* performs all required fsyncs */
8047         /* We deliberately delay 2PC checkpointing as long as possible */
8048         CheckPointTwoPhase(checkPointRedo);
8049 }
8050
8051 /*
8052  * Save a checkpoint for recovery restart if appropriate
8053  *
8054  * This function is called each time a checkpoint record is read from XLOG.
8055  * It must determine whether the checkpoint represents a safe restartpoint or
8056  * not.  If so, the checkpoint record is stashed in shared memory so that
8057  * CreateRestartPoint can consult it.  (Note that the latter function is
8058  * executed by the checkpointer, while this one will be executed by the
8059  * startup process.)
8060  */
8061 static void
8062 RecoveryRestartPoint(const CheckPoint *checkPoint)
8063 {
8064         int                     rmid;
8065
8066         /* use volatile pointer to prevent code rearrangement */
8067         volatile XLogCtlData *xlogctl = XLogCtl;
8068
8069         /*
8070          * Is it safe to restartpoint?  We must ask each of the resource managers
8071          * whether they have any partial state information that might prevent a
8072          * correct restart from this point.  If so, we skip this opportunity, but
8073          * return at the next checkpoint record for another try.
8074          */
8075         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
8076         {
8077                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
8078                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
8079                         {
8080                                 elog(trace_recovery(DEBUG2),
8081                                          "RM %d not safe to record restart point at %X/%X",
8082                                          rmid,
8083                                          checkPoint->redo.xlogid,
8084                                          checkPoint->redo.xrecoff);
8085                                 return;
8086                         }
8087         }
8088
8089         /*
8090          * Also refrain from creating a restartpoint if we have seen any
8091          * references to non-existent pages. Restarting recovery from the
8092          * restartpoint would not see the references, so we would lose the
8093          * cross-check that the pages belonged to a relation that was dropped
8094          * later.
8095          */
8096         if (XLogHaveInvalidPages())
8097         {
8098                 elog(trace_recovery(DEBUG2),
8099                          "could not record restart point at %X/%X because there "
8100                          "are unresolved references to invalid pages",
8101                          checkPoint->redo.xlogid,
8102                          checkPoint->redo.xrecoff);
8103                 return;
8104         }
8105
8106         /*
8107          * Copy the checkpoint record to shared memory, so that checkpointer can
8108          * work out the next time it wants to perform a restartpoint.
8109          */
8110         SpinLockAcquire(&xlogctl->info_lck);
8111         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
8112         memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
8113         SpinLockRelease(&xlogctl->info_lck);
8114 }
8115
8116 /*
8117  * Establish a restartpoint if possible.
8118  *
8119  * This is similar to CreateCheckPoint, but is used during WAL recovery
8120  * to establish a point from which recovery can roll forward without
8121  * replaying the entire recovery log.
8122  *
8123  * Returns true if a new restartpoint was established. We can only establish
8124  * a restartpoint if we have replayed a safe checkpoint record since last
8125  * restartpoint.
8126  */
8127 bool
8128 CreateRestartPoint(int flags)
8129 {
8130         XLogRecPtr      lastCheckPointRecPtr;
8131         CheckPoint      lastCheckPoint;
8132         uint32          _logId;
8133         uint32          _logSeg;
8134         TimestampTz xtime;
8135
8136         /* use volatile pointer to prevent code rearrangement */
8137         volatile XLogCtlData *xlogctl = XLogCtl;
8138
8139         /*
8140          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
8141          * happens at a time.
8142          */
8143         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8144
8145         /* Get a local copy of the last safe checkpoint record. */
8146         SpinLockAcquire(&xlogctl->info_lck);
8147         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
8148         memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
8149         SpinLockRelease(&xlogctl->info_lck);
8150
8151         /*
8152          * Check that we're still in recovery mode. It's ok if we exit recovery
8153          * mode after this check, the restart point is valid anyway.
8154          */
8155         if (!RecoveryInProgress())
8156         {
8157                 ereport(DEBUG2,
8158                           (errmsg("skipping restartpoint, recovery has already ended")));
8159                 LWLockRelease(CheckpointLock);
8160                 return false;
8161         }
8162
8163         /*
8164          * If the last checkpoint record we've replayed is already our last
8165          * restartpoint, we can't perform a new restart point. We still update
8166          * minRecoveryPoint in that case, so that if this is a shutdown restart
8167          * point, we won't start up earlier than before. That's not strictly
8168          * necessary, but when hot standby is enabled, it would be rather weird if
8169          * the database opened up for read-only connections at a point-in-time
8170          * before the last shutdown. Such time travel is still possible in case of
8171          * immediate shutdown, though.
8172          *
8173          * We don't explicitly advance minRecoveryPoint when we do create a
8174          * restartpoint. It's assumed that flushing the buffers will do that as a
8175          * side-effect.
8176          */
8177         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
8178                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
8179         {
8180                 ereport(DEBUG2,
8181                                 (errmsg("skipping restartpoint, already performed at %X/%X",
8182                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
8183
8184                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8185                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8186                 {
8187                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8188                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8189                         ControlFile->time = (pg_time_t) time(NULL);
8190                         UpdateControlFile();
8191                         LWLockRelease(ControlFileLock);
8192                 }
8193                 LWLockRelease(CheckpointLock);
8194                 return false;
8195         }
8196
8197         /*
8198          * Update the shared RedoRecPtr so that the startup process can calculate
8199          * the number of segments replayed since last restartpoint, and request a
8200          * restartpoint if it exceeds checkpoint_segments.
8201          *
8202          * You need to hold WALInsertLock and info_lck to update it, although
8203          * during recovery acquiring WALInsertLock is just pro forma, because
8204          * there is no other processes updating Insert.RedoRecPtr.
8205          */
8206         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8207         SpinLockAcquire(&xlogctl->info_lck);
8208         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
8209         SpinLockRelease(&xlogctl->info_lck);
8210         LWLockRelease(WALInsertLock);
8211
8212         /*
8213          * Prepare to accumulate statistics.
8214          *
8215          * Note: because it is possible for log_checkpoints to change while a
8216          * checkpoint proceeds, we always accumulate stats, even if
8217          * log_checkpoints is currently off.
8218          */
8219         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8220         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8221
8222         if (log_checkpoints)
8223                 LogCheckpointStart(flags, true);
8224
8225         CheckPointGuts(lastCheckPoint.redo, flags);
8226
8227         /*
8228          * Select point at which we can truncate the xlog, which we base on the
8229          * prior checkpoint's earliest info.
8230          */
8231         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
8232
8233         /*
8234          * Update pg_control, using current time.  Check that it still shows
8235          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
8236          * this is a quick hack to make sure nothing really bad happens if somehow
8237          * we get here after the end-of-recovery checkpoint.
8238          */
8239         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8240         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
8241                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
8242         {
8243                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
8244                 ControlFile->checkPoint = lastCheckPointRecPtr;
8245                 ControlFile->checkPointCopy = lastCheckPoint;
8246                 ControlFile->time = (pg_time_t) time(NULL);
8247                 if (flags & CHECKPOINT_IS_SHUTDOWN)
8248                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8249                 UpdateControlFile();
8250         }
8251         LWLockRelease(ControlFileLock);
8252
8253         /*
8254          * Delete old log files (those no longer needed even for previous
8255          * checkpoint/restartpoint) to prevent the disk holding the xlog from
8256          * growing full.
8257          */
8258         if (_logId || _logSeg)
8259         {
8260                 XLogRecPtr      endptr;
8261
8262                 /* Get the current (or recent) end of xlog */
8263                 endptr = GetStandbyFlushRecPtr();
8264
8265                 KeepLogSeg(endptr, &_logId, &_logSeg);
8266                 PrevLogSeg(_logId, _logSeg);
8267                 RemoveOldXlogFiles(_logId, _logSeg, endptr);
8268
8269                 /*
8270                  * Make more log segments if needed.  (Do this after recycling old log
8271                  * segments, since that may supply some of the needed files.)
8272                  */
8273                 PreallocXlogFiles(endptr);
8274         }
8275
8276         /*
8277          * Truncate pg_subtrans if possible.  We can throw away all data before
8278          * the oldest XMIN of any running transaction.  No future transaction will
8279          * attempt to reference any pg_subtrans entry older than that (see Asserts
8280          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
8281          * this because StartupSUBTRANS hasn't been called yet.
8282          */
8283         if (EnableHotStandby)
8284                 TruncateSUBTRANS(GetOldestXmin(true, false));
8285
8286         /* Real work is done, but log and update before releasing lock. */
8287         LogCheckpointEnd(true);
8288
8289         xtime = GetLatestXTime();
8290         ereport((log_checkpoints ? LOG : DEBUG2),
8291                         (errmsg("recovery restart point at %X/%X",
8292                                         lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff),
8293                    xtime ? errdetail("last completed transaction was at log time %s",
8294                                                          timestamptz_to_str(xtime)) : 0));
8295
8296         LWLockRelease(CheckpointLock);
8297
8298         /*
8299          * Finally, execute archive_cleanup_command, if any.
8300          */
8301         if (XLogCtl->archiveCleanupCommand[0])
8302                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
8303                                                            "archive_cleanup_command",
8304                                                            false);
8305
8306         return true;
8307 }
8308
8309 /*
8310  * Calculate the last segment that we need to retain because of
8311  * wal_keep_segments, by subtracting wal_keep_segments from
8312  * the given xlog location, recptr.
8313  */
8314 static void
8315 KeepLogSeg(XLogRecPtr recptr, uint32 *logId, uint32 *logSeg)
8316 {
8317         uint32          log;
8318         uint32          seg;
8319         int                     d_log;
8320         int                     d_seg;
8321
8322         if (wal_keep_segments == 0)
8323                 return;
8324
8325         XLByteToSeg(recptr, log, seg);
8326
8327         d_seg = wal_keep_segments % XLogSegsPerFile;
8328         d_log = wal_keep_segments / XLogSegsPerFile;
8329         if (seg < d_seg)
8330         {
8331                 d_log += 1;
8332                 seg = seg - d_seg + XLogSegsPerFile;
8333         }
8334         else
8335                 seg = seg - d_seg;
8336         /* avoid underflow, don't go below (0,1) */
8337         if (log < d_log || (log == d_log && seg == 0))
8338         {
8339                 log = 0;
8340                 seg = 1;
8341         }
8342         else
8343                 log = log - d_log;
8344
8345         /* don't delete WAL segments newer than the calculated segment */
8346         if (log < *logId || (log == *logId && seg < *logSeg))
8347         {
8348                 *logId = log;
8349                 *logSeg = seg;
8350         }
8351 }
8352
8353 /*
8354  * Write a NEXTOID log record
8355  */
8356 void
8357 XLogPutNextOid(Oid nextOid)
8358 {
8359         XLogRecData rdata;
8360
8361         rdata.data = (char *) (&nextOid);
8362         rdata.len = sizeof(Oid);
8363         rdata.buffer = InvalidBuffer;
8364         rdata.next = NULL;
8365         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
8366
8367         /*
8368          * We need not flush the NEXTOID record immediately, because any of the
8369          * just-allocated OIDs could only reach disk as part of a tuple insert or
8370          * update that would have its own XLOG record that must follow the NEXTOID
8371          * record.      Therefore, the standard buffer LSN interlock applied to those
8372          * records will ensure no such OID reaches disk before the NEXTOID record
8373          * does.
8374          *
8375          * Note, however, that the above statement only covers state "within" the
8376          * database.  When we use a generated OID as a file or directory name, we
8377          * are in a sense violating the basic WAL rule, because that filesystem
8378          * change may reach disk before the NEXTOID WAL record does.  The impact
8379          * of this is that if a database crash occurs immediately afterward, we
8380          * might after restart re-generate the same OID and find that it conflicts
8381          * with the leftover file or directory.  But since for safety's sake we
8382          * always loop until finding a nonconflicting filename, this poses no real
8383          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8384          */
8385 }
8386
8387 /*
8388  * Write an XLOG SWITCH record.
8389  *
8390  * Here we just blindly issue an XLogInsert request for the record.
8391  * All the magic happens inside XLogInsert.
8392  *
8393  * The return value is either the end+1 address of the switch record,
8394  * or the end+1 address of the prior segment if we did not need to
8395  * write a switch record because we are already at segment start.
8396  */
8397 XLogRecPtr
8398 RequestXLogSwitch(void)
8399 {
8400         XLogRecPtr      RecPtr;
8401         XLogRecData rdata;
8402
8403         /* XLOG SWITCH, alone among xlog record types, has no data */
8404         rdata.buffer = InvalidBuffer;
8405         rdata.data = NULL;
8406         rdata.len = 0;
8407         rdata.next = NULL;
8408
8409         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
8410
8411         return RecPtr;
8412 }
8413
8414 /*
8415  * Write a RESTORE POINT record
8416  */
8417 XLogRecPtr
8418 XLogRestorePoint(const char *rpName)
8419 {
8420         XLogRecPtr      RecPtr;
8421         XLogRecData rdata;
8422         xl_restore_point xlrec;
8423
8424         xlrec.rp_time = GetCurrentTimestamp();
8425         strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8426
8427         rdata.buffer = InvalidBuffer;
8428         rdata.data = (char *) &xlrec;
8429         rdata.len = sizeof(xl_restore_point);
8430         rdata.next = NULL;
8431
8432         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
8433
8434         ereport(LOG,
8435                         (errmsg("restore point \"%s\" created at %X/%X",
8436                                         rpName, RecPtr.xlogid, RecPtr.xrecoff)));
8437
8438         return RecPtr;
8439 }
8440
8441 /*
8442  * Check if any of the GUC parameters that are critical for hot standby
8443  * have changed, and update the value in pg_control file if necessary.
8444  */
8445 static void
8446 XLogReportParameters(void)
8447 {
8448         if (wal_level != ControlFile->wal_level ||
8449                 MaxConnections != ControlFile->MaxConnections ||
8450                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8451                 max_locks_per_xact != ControlFile->max_locks_per_xact)
8452         {
8453                 /*
8454                  * The change in number of backend slots doesn't need to be WAL-logged
8455                  * if archiving is not enabled, as you can't start archive recovery
8456                  * with wal_level=minimal anyway. We don't really care about the
8457                  * values in pg_control either if wal_level=minimal, but seems better
8458                  * to keep them up-to-date to avoid confusion.
8459                  */
8460                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8461                 {
8462                         XLogRecData rdata;
8463                         xl_parameter_change xlrec;
8464
8465                         xlrec.MaxConnections = MaxConnections;
8466                         xlrec.max_prepared_xacts = max_prepared_xacts;
8467                         xlrec.max_locks_per_xact = max_locks_per_xact;
8468                         xlrec.wal_level = wal_level;
8469
8470                         rdata.buffer = InvalidBuffer;
8471                         rdata.data = (char *) &xlrec;
8472                         rdata.len = sizeof(xlrec);
8473                         rdata.next = NULL;
8474
8475                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
8476                 }
8477
8478                 ControlFile->MaxConnections = MaxConnections;
8479                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8480                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8481                 ControlFile->wal_level = wal_level;
8482                 UpdateControlFile();
8483         }
8484 }
8485
8486 /*
8487  * Update full_page_writes in shared memory, and write an
8488  * XLOG_FPW_CHANGE record if necessary.
8489  *
8490  * Note: this function assumes there is no other process running
8491  * concurrently that could update it.
8492  */
8493 void
8494 UpdateFullPageWrites(void)
8495 {
8496         XLogCtlInsert *Insert = &XLogCtl->Insert;
8497
8498         /*
8499          * Do nothing if full_page_writes has not been changed.
8500          *
8501          * It's safe to check the shared full_page_writes without the lock,
8502          * because we assume that there is no concurrently running process which
8503          * can update it.
8504          */
8505         if (fullPageWrites == Insert->fullPageWrites)
8506                 return;
8507
8508         START_CRIT_SECTION();
8509
8510         /*
8511          * It's always safe to take full page images, even when not strictly
8512          * required, but not the other round. So if we're setting full_page_writes
8513          * to true, first set it true and then write the WAL record. If we're
8514          * setting it to false, first write the WAL record and then set the global
8515          * flag.
8516          */
8517         if (fullPageWrites)
8518         {
8519                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8520                 Insert->fullPageWrites = true;
8521                 LWLockRelease(WALInsertLock);
8522         }
8523
8524         /*
8525          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8526          * full_page_writes during archive recovery, if required.
8527          */
8528         if (XLogStandbyInfoActive() && !RecoveryInProgress())
8529         {
8530                 XLogRecData rdata;
8531
8532                 rdata.data = (char *) (&fullPageWrites);
8533                 rdata.len = sizeof(bool);
8534                 rdata.buffer = InvalidBuffer;
8535                 rdata.next = NULL;
8536
8537                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
8538         }
8539
8540         if (!fullPageWrites)
8541         {
8542                 LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8543                 Insert->fullPageWrites = false;
8544                 LWLockRelease(WALInsertLock);
8545         }
8546         END_CRIT_SECTION();
8547 }
8548
8549 /*
8550  * XLOG resource manager's routines
8551  *
8552  * Definitions of info values are in include/catalog/pg_control.h, though
8553  * not all record types are related to control file updates.
8554  */
8555 void
8556 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
8557 {
8558         uint8           info = record->xl_info & ~XLR_INFO_MASK;
8559
8560         /* Backup blocks are not used in xlog records */
8561         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
8562
8563         if (info == XLOG_NEXTOID)
8564         {
8565                 Oid                     nextOid;
8566
8567                 /*
8568                  * We used to try to take the maximum of ShmemVariableCache->nextOid
8569                  * and the recorded nextOid, but that fails if the OID counter wraps
8570                  * around.      Since no OID allocation should be happening during replay
8571                  * anyway, better to just believe the record exactly.  We still take
8572                  * OidGenLock while setting the variable, just in case.
8573                  */
8574                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8575                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8576                 ShmemVariableCache->nextOid = nextOid;
8577                 ShmemVariableCache->oidCount = 0;
8578                 LWLockRelease(OidGenLock);
8579         }
8580         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8581         {
8582                 CheckPoint      checkPoint;
8583
8584                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8585                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
8586                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8587                 ShmemVariableCache->nextXid = checkPoint.nextXid;
8588                 LWLockRelease(XidGenLock);
8589                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8590                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8591                 ShmemVariableCache->oidCount = 0;
8592                 LWLockRelease(OidGenLock);
8593                 MultiXactSetNextMXact(checkPoint.nextMulti,
8594                                                           checkPoint.nextMultiOffset);
8595                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8596
8597                 /*
8598                  * If we see a shutdown checkpoint while waiting for an end-of-backup
8599                  * record, the backup was canceled and the end-of-backup record will
8600                  * never arrive.
8601                  */
8602                 if (InArchiveRecovery &&
8603                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8604                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8605                         ereport(PANIC,
8606                         (errmsg("online backup was canceled, recovery cannot continue")));
8607
8608                 /*
8609                  * If we see a shutdown checkpoint, we know that nothing was running
8610                  * on the master at this point. So fake-up an empty running-xacts
8611                  * record and use that here and now. Recover additional standby state
8612                  * for prepared transactions.
8613                  */
8614                 if (standbyState >= STANDBY_INITIALIZED)
8615                 {
8616                         TransactionId *xids;
8617                         int                     nxids;
8618                         TransactionId oldestActiveXID;
8619                         TransactionId latestCompletedXid;
8620                         RunningTransactionsData running;
8621
8622                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8623
8624                         /*
8625                          * Construct a RunningTransactions snapshot representing a shut
8626                          * down server, with only prepared transactions still alive. We're
8627                          * never overflowed at this point because all subxids are listed
8628                          * with their parent prepared transactions.
8629                          */
8630                         running.xcnt = nxids;
8631                         running.subxid_overflow = false;
8632                         running.nextXid = checkPoint.nextXid;
8633                         running.oldestRunningXid = oldestActiveXID;
8634                         latestCompletedXid = checkPoint.nextXid;
8635                         TransactionIdRetreat(latestCompletedXid);
8636                         Assert(TransactionIdIsNormal(latestCompletedXid));
8637                         running.latestCompletedXid = latestCompletedXid;
8638                         running.xids = xids;
8639
8640                         ProcArrayApplyRecoveryInfo(&running);
8641
8642                         StandbyRecoverPreparedTransactions(true);
8643                 }
8644
8645                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8646                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8647                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8648
8649                 /* Update shared-memory copy of checkpoint XID/epoch */
8650                 {
8651                         /* use volatile pointer to prevent code rearrangement */
8652                         volatile XLogCtlData *xlogctl = XLogCtl;
8653
8654                         SpinLockAcquire(&xlogctl->info_lck);
8655                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8656                         xlogctl->ckptXid = checkPoint.nextXid;
8657                         SpinLockRelease(&xlogctl->info_lck);
8658                 }
8659
8660                 /*
8661                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
8662                  */
8663                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8664                 {
8665                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
8666                                 !list_member_int(expectedTLIs,
8667                                                                  (int) checkPoint.ThisTimeLineID))
8668                                 ereport(PANIC,
8669                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
8670                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
8671                         /* Following WAL records should be run with new TLI */
8672                         ThisTimeLineID = checkPoint.ThisTimeLineID;
8673                 }
8674
8675                 RecoveryRestartPoint(&checkPoint);
8676         }
8677         else if (info == XLOG_CHECKPOINT_ONLINE)
8678         {
8679                 CheckPoint      checkPoint;
8680
8681                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8682                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8683                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8684                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
8685                                                                   checkPoint.nextXid))
8686                         ShmemVariableCache->nextXid = checkPoint.nextXid;
8687                 LWLockRelease(XidGenLock);
8688                 /* ... but still treat OID counter as exact */
8689                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8690                 ShmemVariableCache->nextOid = checkPoint.nextOid;
8691                 ShmemVariableCache->oidCount = 0;
8692                 LWLockRelease(OidGenLock);
8693                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8694                                                                   checkPoint.nextMultiOffset);
8695                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
8696                                                                   checkPoint.oldestXid))
8697                         SetTransactionIdLimit(checkPoint.oldestXid,
8698                                                                   checkPoint.oldestXidDB);
8699
8700                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8701                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8702                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8703
8704                 /* Update shared-memory copy of checkpoint XID/epoch */
8705                 {
8706                         /* use volatile pointer to prevent code rearrangement */
8707                         volatile XLogCtlData *xlogctl = XLogCtl;
8708
8709                         SpinLockAcquire(&xlogctl->info_lck);
8710                         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
8711                         xlogctl->ckptXid = checkPoint.nextXid;
8712                         SpinLockRelease(&xlogctl->info_lck);
8713                 }
8714
8715                 /* TLI should not change in an on-line checkpoint */
8716                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8717                         ereport(PANIC,
8718                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8719                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8720
8721                 RecoveryRestartPoint(&checkPoint);
8722         }
8723         else if (info == XLOG_NOOP)
8724         {
8725                 /* nothing to do here */
8726         }
8727         else if (info == XLOG_SWITCH)
8728         {
8729                 /* nothing to do here */
8730         }
8731         else if (info == XLOG_RESTORE_POINT)
8732         {
8733                 /* nothing to do here */
8734         }
8735         else if (info == XLOG_BACKUP_END)
8736         {
8737                 XLogRecPtr      startpoint;
8738
8739                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8740
8741                 if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
8742                 {
8743                         /*
8744                          * We have reached the end of base backup, the point where
8745                          * pg_stop_backup() was done. The data on disk is now consistent.
8746                          * Reset backupStartPoint, and update minRecoveryPoint to make
8747                          * sure we don't allow starting up at an earlier point even if
8748                          * recovery is stopped and restarted soon after this.
8749                          */
8750                         elog(DEBUG1, "end of backup reached");
8751
8752                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8753
8754                         if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
8755                                 ControlFile->minRecoveryPoint = lsn;
8756                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
8757                         ControlFile->backupEndRequired = false;
8758                         UpdateControlFile();
8759
8760                         LWLockRelease(ControlFileLock);
8761                 }
8762         }
8763         else if (info == XLOG_PARAMETER_CHANGE)
8764         {
8765                 xl_parameter_change xlrec;
8766
8767                 /* Update our copy of the parameters in pg_control */
8768                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8769
8770                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8771                 ControlFile->MaxConnections = xlrec.MaxConnections;
8772                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8773                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8774                 ControlFile->wal_level = xlrec.wal_level;
8775
8776                 /*
8777                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8778                  * recover back up to this point before allowing hot standby again.
8779                  * This is particularly important if wal_level was set to 'archive'
8780                  * before, and is now 'hot_standby', to ensure you don't run queries
8781                  * against the WAL preceding the wal_level change. Same applies to
8782                  * decreasing max_* settings.
8783                  */
8784                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8785                 if ((minRecoveryPoint.xlogid != 0 || minRecoveryPoint.xrecoff != 0)
8786                         && XLByteLT(minRecoveryPoint, lsn))
8787                 {
8788                         ControlFile->minRecoveryPoint = lsn;
8789                 }
8790
8791                 UpdateControlFile();
8792                 LWLockRelease(ControlFileLock);
8793
8794                 /* Check to see if any changes to max_connections give problems */
8795                 CheckRequiredParameterValues();
8796         }
8797         else if (info == XLOG_FPW_CHANGE)
8798         {
8799                 /* use volatile pointer to prevent code rearrangement */
8800                 volatile XLogCtlData *xlogctl = XLogCtl;
8801                 bool            fpw;
8802
8803                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8804
8805                 /*
8806                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8807                  * do_pg_start_backup() and do_pg_stop_backup() can check whether
8808                  * full_page_writes has been disabled during online backup.
8809                  */
8810                 if (!fpw)
8811                 {
8812                         SpinLockAcquire(&xlogctl->info_lck);
8813                         if (XLByteLT(xlogctl->lastFpwDisableRecPtr, ReadRecPtr))
8814                                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
8815                         SpinLockRelease(&xlogctl->info_lck);
8816                 }
8817
8818                 /* Keep track of full_page_writes */
8819                 lastFullPageWrites = fpw;
8820         }
8821 }
8822
8823 void
8824 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
8825 {
8826         uint8           info = xl_info & ~XLR_INFO_MASK;
8827
8828         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
8829                 info == XLOG_CHECKPOINT_ONLINE)
8830         {
8831                 CheckPoint *checkpoint = (CheckPoint *) rec;
8832
8833                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
8834                                    "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; "
8835                                                  "oldest xid %u in DB %u; oldest running xid %u; %s",
8836                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
8837                                                  checkpoint->ThisTimeLineID,
8838                                                  checkpoint->fullPageWrites ? "true" : "false",
8839                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
8840                                                  checkpoint->nextOid,
8841                                                  checkpoint->nextMulti,
8842                                                  checkpoint->nextMultiOffset,
8843                                                  checkpoint->oldestXid,
8844                                                  checkpoint->oldestXidDB,
8845                                                  checkpoint->oldestActiveXid,
8846                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
8847         }
8848         else if (info == XLOG_NOOP)
8849         {
8850                 appendStringInfo(buf, "xlog no-op");
8851         }
8852         else if (info == XLOG_NEXTOID)
8853         {
8854                 Oid                     nextOid;
8855
8856                 memcpy(&nextOid, rec, sizeof(Oid));
8857                 appendStringInfo(buf, "nextOid: %u", nextOid);
8858         }
8859         else if (info == XLOG_SWITCH)
8860         {
8861                 appendStringInfo(buf, "xlog switch");
8862         }
8863         else if (info == XLOG_RESTORE_POINT)
8864         {
8865                 xl_restore_point *xlrec = (xl_restore_point *) rec;
8866
8867                 appendStringInfo(buf, "restore point: %s", xlrec->rp_name);
8868
8869         }
8870         else if (info == XLOG_BACKUP_END)
8871         {
8872                 XLogRecPtr      startpoint;
8873
8874                 memcpy(&startpoint, rec, sizeof(XLogRecPtr));
8875                 appendStringInfo(buf, "backup end: %X/%X",
8876                                                  startpoint.xlogid, startpoint.xrecoff);
8877         }
8878         else if (info == XLOG_PARAMETER_CHANGE)
8879         {
8880                 xl_parameter_change xlrec;
8881                 const char *wal_level_str;
8882                 const struct config_enum_entry *entry;
8883
8884                 memcpy(&xlrec, rec, sizeof(xl_parameter_change));
8885
8886                 /* Find a string representation for wal_level */
8887                 wal_level_str = "?";
8888                 for (entry = wal_level_options; entry->name; entry++)
8889                 {
8890                         if (entry->val == xlrec.wal_level)
8891                         {
8892                                 wal_level_str = entry->name;
8893                                 break;
8894                         }
8895                 }
8896
8897                 appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
8898                                                  xlrec.MaxConnections,
8899                                                  xlrec.max_prepared_xacts,
8900                                                  xlrec.max_locks_per_xact,
8901                                                  wal_level_str);
8902         }
8903         else if (info == XLOG_FPW_CHANGE)
8904         {
8905                 bool            fpw;
8906
8907                 memcpy(&fpw, rec, sizeof(bool));
8908                 appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
8909         }
8910         else
8911                 appendStringInfo(buf, "UNKNOWN");
8912 }
8913
#ifdef WAL_DEBUG

/*
 * Append a short summary of the header of 'record' to 'buf': the previous
 * record pointer, the xid, the length field, one "bkpbN" marker (N counted
 * from 1) for every backup-block flag set in xl_info, and finally the name
 * of the record's resource manager as listed in RmgrTable.
 *
 * Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk = 0;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	appendStringInfo(buf, "; len %u",
					 record->xl_len);

	/* report each backup block whose flag bit is set in xl_info */
	while (blk < XLR_MAX_BKP_BLOCKS)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(blk))
			appendStringInfo(buf, "; bkpb%d", blk + 1);
		blk++;
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
8937
8938
8939 /*
8940  * Return the (possible) sync flag used for opening a file, depending on the
8941  * value of the GUC wal_sync_method.
8942  */
8943 static int
8944 get_sync_bit(int method)
8945 {
8946         int                     o_direct_flag = 0;
8947
8948         /* If fsync is disabled, never open in sync mode */
8949         if (!enableFsync)
8950                 return 0;
8951
8952         /*
8953          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8954          * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
8955          * disabled, otherwise the archive command or walsender process will read
8956          * the WAL soon after writing it, which is guaranteed to cause a physical
8957          * read if we bypassed the kernel cache. We also skip the
8958          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8959          * reason.
8960          *
8961          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8962          * written by walreceiver is normally read by the startup process soon
8963          * after its written. Also, walreceiver performs unaligned writes, which
8964          * don't work with O_DIRECT, so it is required for correctness too.
8965          */
8966         if (!XLogIsNeeded() && !am_walreceiver)
8967                 o_direct_flag = PG_O_DIRECT;
8968
8969         switch (method)
8970         {
8971                         /*
8972                          * enum values for all sync options are defined even if they are
8973                          * not supported on the current platform.  But if not, they are
8974                          * not included in the enum option array, and therefore will never
8975                          * be seen here.
8976                          */
8977                 case SYNC_METHOD_FSYNC:
8978                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8979                 case SYNC_METHOD_FDATASYNC:
8980                         return 0;
8981 #ifdef OPEN_SYNC_FLAG
8982                 case SYNC_METHOD_OPEN:
8983                         return OPEN_SYNC_FLAG | o_direct_flag;
8984 #endif
8985 #ifdef OPEN_DATASYNC_FLAG
8986                 case SYNC_METHOD_OPEN_DSYNC:
8987                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8988 #endif
8989                 default:
8990                         /* can't happen (unless we are out of sync with option array) */
8991                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8992                         return 0;                       /* silence warning */
8993         }
8994 }
8995
8996 /*
8997  * GUC support
8998  */
8999 void
9000 assign_xlog_sync_method(int new_sync_method, void *extra)
9001 {
9002         if (sync_method != new_sync_method)
9003         {
9004                 /*
9005                  * To ensure that no blocks escape unsynced, force an fsync on the
9006                  * currently open log segment (if any).  Also, if the open flag is
9007                  * changing, close the log file so it will be reopened (with new flag
9008                  * bit) at next use.
9009                  */
9010                 if (openLogFile >= 0)
9011                 {
9012                         if (pg_fsync(openLogFile) != 0)
9013                                 ereport(PANIC,
9014                                                 (errcode_for_file_access(),
9015                                                  errmsg("could not fsync log file %u, segment %u: %m",
9016                                                                 openLogId, openLogSeg)));
9017                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
9018                                 XLogFileClose();
9019                 }
9020         }
9021 }
9022
9023
9024 /*
9025  * Issue appropriate kind of fsync (if any) for an XLOG output file.
9026  *
9027  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
9028  * 'log' and 'seg' are for error reporting purposes.
9029  */
9030 void
9031 issue_xlog_fsync(int fd, uint32 log, uint32 seg)
9032 {
9033         switch (sync_method)
9034         {
9035                 case SYNC_METHOD_FSYNC:
9036                         if (pg_fsync_no_writethrough(fd) != 0)
9037                                 ereport(PANIC,
9038                                                 (errcode_for_file_access(),
9039                                                  errmsg("could not fsync log file %u, segment %u: %m",
9040                                                                 log, seg)));
9041                         break;
9042 #ifdef HAVE_FSYNC_WRITETHROUGH
9043                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
9044                         if (pg_fsync_writethrough(fd) != 0)
9045                                 ereport(PANIC,
9046                                                 (errcode_for_file_access(),
9047                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
9048                                                                 log, seg)));
9049                         break;
9050 #endif
9051 #ifdef HAVE_FDATASYNC
9052                 case SYNC_METHOD_FDATASYNC:
9053                         if (pg_fdatasync(fd) != 0)
9054                                 ereport(PANIC,
9055                                                 (errcode_for_file_access(),
9056                                         errmsg("could not fdatasync log file %u, segment %u: %m",
9057                                                    log, seg)));
9058                         break;
9059 #endif
9060                 case SYNC_METHOD_OPEN:
9061                 case SYNC_METHOD_OPEN_DSYNC:
9062                         /* write synced it already */
9063                         break;
9064                 default:
9065                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
9066                         break;
9067         }
9068 }
9069
9070 /*
9071  * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
9072  * function. It creates the necessary starting checkpoint and constructs the
9073  * backup label file.
9074  *
9075  * There are two kind of backups: exclusive and non-exclusive. An exclusive
9076  * backup is started with pg_start_backup(), and there can be only one active
9077  * at a time. The backup label file of an exclusive backup is written to
9078  * $PGDATA/backup_label, and it is removed by pg_stop_backup().
9079  *
9080  * A non-exclusive backup is used for the streaming base backups (see
9081  * src/backend/replication/basebackup.c). The difference to exclusive backups
9082  * is that the backup label file is not written to disk. Instead, its would-be
9083  * contents are returned in *labelfile, and the caller is responsible for
9084  * including it in the backup archive as 'backup_label'. There can be many
9085  * non-exclusive backups active at the same time, and they don't conflict
9086  * with an exclusive backup either.
9087  *
9088  * Every successfully started non-exclusive backup must be stopped by calling
9089  * do_pg_stop_backup() or do_pg_abort_backup().
9090  */
9091 XLogRecPtr
9092 do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
9093 {
9094         bool            exclusive = (labelfile == NULL);
9095         bool            backup_started_in_recovery = false;
9096         XLogRecPtr      checkpointloc;
9097         XLogRecPtr      startpoint;
9098         pg_time_t       stamp_time;
9099         char            strfbuf[128];
9100         char            xlogfilename[MAXFNAMELEN];
9101         uint32          _logId;
9102         uint32          _logSeg;
9103         struct stat stat_buf;
9104         FILE       *fp;
9105         StringInfoData labelfbuf;
9106
9107         backup_started_in_recovery = RecoveryInProgress();
9108
9109         if (!superuser() && !is_authenticated_user_replication_role())
9110                 ereport(ERROR,
9111                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9112                    errmsg("must be superuser or replication role to run a backup")));
9113
9114         /*
9115          * Currently only non-exclusive backup can be taken during recovery.
9116          */
9117         if (backup_started_in_recovery && exclusive)
9118                 ereport(ERROR,
9119                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9120                                  errmsg("recovery is in progress"),
9121                                  errhint("WAL control functions cannot be executed during recovery.")));
9122
9123         /*
9124          * During recovery, we don't need to check WAL level. Because, if WAL
9125          * level is not sufficient, it's impossible to get here during recovery.
9126          */
9127         if (!backup_started_in_recovery && !XLogIsNeeded())
9128                 ereport(ERROR,
9129                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9130                           errmsg("WAL level not sufficient for making an online backup"),
9131                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9132
9133         if (strlen(backupidstr) > MAXPGPATH)
9134                 ereport(ERROR,
9135                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
9136                                  errmsg("backup label too long (max %d bytes)",
9137                                                 MAXPGPATH)));
9138
9139         /*
9140          * Mark backup active in shared memory.  We must do full-page WAL writes
9141          * during an on-line backup even if not doing so at other times, because
9142          * it's quite possible for the backup dump to obtain a "torn" (partially
9143          * written) copy of a database page if it reads the page concurrently with
9144          * our write to the same page.  This can be fixed as long as the first
9145          * write to the page in the WAL sequence is a full-page write. Hence, we
9146          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
9147          * are no dirty pages in shared memory that might get dumped while the
9148          * backup is in progress without having a corresponding WAL record.  (Once
9149          * the backup is complete, we need not force full-page writes anymore,
9150          * since we expect that any pages not modified during the backup interval
9151          * must have been correctly captured by the backup.)
9152          *
9153          * Note that forcePageWrites has no effect during an online backup from
9154          * the standby.
9155          *
9156          * We must hold WALInsertLock to change the value of forcePageWrites, to
9157          * ensure adequate interlocking against XLogInsert().
9158          */
9159         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9160         if (exclusive)
9161         {
9162                 if (XLogCtl->Insert.exclusiveBackup)
9163                 {
9164                         LWLockRelease(WALInsertLock);
9165                         ereport(ERROR,
9166                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9167                                          errmsg("a backup is already in progress"),
9168                                          errhint("Run pg_stop_backup() and try again.")));
9169                 }
9170                 XLogCtl->Insert.exclusiveBackup = true;
9171         }
9172         else
9173                 XLogCtl->Insert.nonExclusiveBackups++;
9174         XLogCtl->Insert.forcePageWrites = true;
9175         LWLockRelease(WALInsertLock);
9176
9177         /* Ensure we release forcePageWrites if fail below */
9178         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9179         {
9180                 bool            gotUniqueStartpoint = false;
9181
9182                 /*
9183                  * Force an XLOG file switch before the checkpoint, to ensure that the
9184                  * WAL segment the checkpoint is written to doesn't contain pages with
9185                  * old timeline IDs.  That would otherwise happen if you called
9186                  * pg_start_backup() right after restoring from a PITR archive: the
9187                  * first WAL segment containing the startup checkpoint has pages in
9188                  * the beginning with the old timeline ID.      That can cause trouble at
9189                  * recovery: we won't have a history file covering the old timeline if
9190                  * pg_xlog directory was not included in the base backup and the WAL
9191                  * archive was cleared too before starting the backup.
9192                  *
9193                  * This also ensures that we have emitted a WAL page header that has
9194                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9195                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
9196                  * compress out removable backup blocks, it won't remove any that
9197                  * occur after this point.
9198                  *
9199                  * During recovery, we skip forcing XLOG file switch, which means that
9200                  * the backup taken during recovery is not available for the special
9201                  * recovery case described above.
9202                  */
9203                 if (!backup_started_in_recovery)
9204                         RequestXLogSwitch();
9205
9206                 do
9207                 {
9208                         bool            checkpointfpw;
9209
9210                         /*
9211                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9212                          * page problems, this guarantees that two successive backup runs
9213                          * will have different checkpoint positions and hence different
9214                          * history file names, even if nothing happened in between.
9215                          *
9216                          * During recovery, establish a restartpoint if possible. We use
9217                          * the last restartpoint as the backup starting checkpoint. This
9218                          * means that two successive backup runs can have same checkpoint
9219                          * positions.
9220                          *
9221                          * Since the fact that we are executing do_pg_start_backup()
9222                          * during recovery means that checkpointer is running, we can use
9223                          * RequestCheckpoint() to establish a restartpoint.
9224                          *
9225                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9226                          * passing fast = true).  Otherwise this can take awhile.
9227                          */
9228                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9229                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
9230
9231                         /*
9232                          * Now we need to fetch the checkpoint record location, and also
9233                          * its REDO pointer.  The oldest point in WAL that would be needed
9234                          * to restore starting from the checkpoint is precisely the REDO
9235                          * pointer.
9236                          */
9237                         LWLockAcquire(ControlFileLock, LW_SHARED);
9238                         checkpointloc = ControlFile->checkPoint;
9239                         startpoint = ControlFile->checkPointCopy.redo;
9240                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9241                         LWLockRelease(ControlFileLock);
9242
9243                         if (backup_started_in_recovery)
9244                         {
9245                                 /* use volatile pointer to prevent code rearrangement */
9246                                 volatile XLogCtlData *xlogctl = XLogCtl;
9247                                 XLogRecPtr      recptr;
9248
9249                                 /*
9250                                  * Check to see if all WAL replayed during online backup
9251                                  * (i.e., since last restartpoint used as backup starting
9252                                  * checkpoint) contain full-page writes.
9253                                  */
9254                                 SpinLockAcquire(&xlogctl->info_lck);
9255                                 recptr = xlogctl->lastFpwDisableRecPtr;
9256                                 SpinLockRelease(&xlogctl->info_lck);
9257
9258                                 if (!checkpointfpw || XLByteLE(startpoint, recptr))
9259                                         ereport(ERROR,
9260                                                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9261                                                    errmsg("WAL generated with full_page_writes=off was replayed "
9262                                                                   "since last restartpoint"),
9263                                                    errhint("This means that the backup being taken on standby "
9264                                                                    "is corrupt and should not be used. "
9265                                                                    "Enable full_page_writes and run CHECKPOINT on the master, "
9266                                                                    "and then try an online backup again.")));
9267
9268                                 /*
9269                                  * During recovery, since we don't use the end-of-backup WAL
9270                                  * record and don't write the backup history file, the
9271                                  * starting WAL location doesn't need to be unique. This means
9272                                  * that two base backups started at the same time might use
9273                                  * the same checkpoint as starting locations.
9274                                  */
9275                                 gotUniqueStartpoint = true;
9276                         }
9277
9278                         /*
9279                          * If two base backups are started at the same time (in WAL sender
9280                          * processes), we need to make sure that they use different
9281                          * checkpoints as starting locations, because we use the starting
9282                          * WAL location as a unique identifier for the base backup in the
9283                          * end-of-backup WAL record and when we write the backup history
9284                          * file. Perhaps it would be better generate a separate unique ID
9285                          * for each backup instead of forcing another checkpoint, but
9286                          * taking a checkpoint right after another is not that expensive
9287                          * either because only few buffers have been dirtied yet.
9288                          */
9289                         LWLockAcquire(WALInsertLock, LW_SHARED);
9290                         if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
9291                         {
9292                                 XLogCtl->Insert.lastBackupStart = startpoint;
9293                                 gotUniqueStartpoint = true;
9294                         }
9295                         LWLockRelease(WALInsertLock);
9296                 } while (!gotUniqueStartpoint);
9297
9298                 XLByteToSeg(startpoint, _logId, _logSeg);
9299                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
9300
9301                 /*
9302                  * Construct backup label file
9303                  */
9304                 initStringInfo(&labelfbuf);
9305
9306                 /* Use the log timezone here, not the session timezone */
9307                 stamp_time = (pg_time_t) time(NULL);
9308                 pg_strftime(strfbuf, sizeof(strfbuf),
9309                                         "%Y-%m-%d %H:%M:%S %Z",
9310                                         pg_localtime(&stamp_time, log_timezone));
9311                 appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
9312                                                  startpoint.xlogid, startpoint.xrecoff, xlogfilename);
9313                 appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
9314                                                  checkpointloc.xlogid, checkpointloc.xrecoff);
9315                 appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
9316                                                  exclusive ? "pg_start_backup" : "streamed");
9317                 appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
9318                                                  backup_started_in_recovery ? "standby" : "master");
9319                 appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
9320                 appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
9321
9322                 /*
9323                  * Okay, write the file, or return its contents to caller.
9324                  */
9325                 if (exclusive)
9326                 {
9327                         /*
9328                          * Check for existing backup label --- implies a backup is already
9329                          * running.  (XXX given that we checked exclusiveBackup above,
9330                          * maybe it would be OK to just unlink any such label file?)
9331                          */
9332                         if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
9333                         {
9334                                 if (errno != ENOENT)
9335                                         ereport(ERROR,
9336                                                         (errcode_for_file_access(),
9337                                                          errmsg("could not stat file \"%s\": %m",
9338                                                                         BACKUP_LABEL_FILE)));
9339                         }
9340                         else
9341                                 ereport(ERROR,
9342                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9343                                                  errmsg("a backup is already in progress"),
9344                                                  errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
9345                                                                  BACKUP_LABEL_FILE)));
9346
9347                         fp = AllocateFile(BACKUP_LABEL_FILE, "w");
9348
9349                         if (!fp)
9350                                 ereport(ERROR,
9351                                                 (errcode_for_file_access(),
9352                                                  errmsg("could not create file \"%s\": %m",
9353                                                                 BACKUP_LABEL_FILE)));
9354                         if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
9355                                 fflush(fp) != 0 ||
9356                                 ferror(fp) ||
9357                                 FreeFile(fp))
9358                                 ereport(ERROR,
9359                                                 (errcode_for_file_access(),
9360                                                  errmsg("could not write file \"%s\": %m",
9361                                                                 BACKUP_LABEL_FILE)));
9362                         pfree(labelfbuf.data);
9363                 }
9364                 else
9365                         *labelfile = labelfbuf.data;
9366         }
9367         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
9368
9369         /*
9370          * We're done.  As a convenience, return the starting WAL location.
9371          */
9372         return startpoint;
9373 }
9374
9375 /* Error cleanup callback for pg_start_backup */
9376 static void
9377 pg_start_backup_callback(int code, Datum arg)
9378 {
9379         bool            exclusive = DatumGetBool(arg);
9380
9381         /* Update backup counters and forcePageWrites on failure */
9382         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9383         if (exclusive)
9384         {
9385                 Assert(XLogCtl->Insert.exclusiveBackup);
9386                 XLogCtl->Insert.exclusiveBackup = false;
9387         }
9388         else
9389         {
9390                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9391                 XLogCtl->Insert.nonExclusiveBackups--;
9392         }
9393
9394         if (!XLogCtl->Insert.exclusiveBackup &&
9395                 XLogCtl->Insert.nonExclusiveBackups == 0)
9396         {
9397                 XLogCtl->Insert.forcePageWrites = false;
9398         }
9399         LWLockRelease(WALInsertLock);
9400 }
9401
9402 /*
9403  * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
9404  * function.
9405  *
9406  * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
9407  * the non-exclusive backup specified by 'labelfile'.
9408  */
9409 XLogRecPtr
9410 do_pg_stop_backup(char *labelfile, bool waitforarchive)
9411 {
9412         bool            exclusive = (labelfile == NULL);
9413         bool            backup_started_in_recovery = false;
9414         XLogRecPtr      startpoint;
9415         XLogRecPtr      stoppoint;
9416         XLogRecData rdata;
9417         pg_time_t       stamp_time;
9418         char            strfbuf[128];
9419         char            histfilepath[MAXPGPATH];
9420         char            startxlogfilename[MAXFNAMELEN];
9421         char            stopxlogfilename[MAXFNAMELEN];
9422         char            lastxlogfilename[MAXFNAMELEN];
9423         char            histfilename[MAXFNAMELEN];
9424         char            backupfrom[20];
9425         uint32          _logId;
9426         uint32          _logSeg;
9427         FILE       *lfp;
9428         FILE       *fp;
9429         char            ch;
9430         int                     seconds_before_warning;
9431         int                     waits = 0;
9432         bool            reported_waiting = false;
9433         char       *remaining;
9434         char       *ptr;
9435
9436         backup_started_in_recovery = RecoveryInProgress();
9437
9438         if (!superuser() && !is_authenticated_user_replication_role())
9439                 ereport(ERROR,
9440                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9441                  (errmsg("must be superuser or replication role to run a backup"))));
9442
9443         /*
9444          * Currently only non-exclusive backup can be taken during recovery.
9445          */
9446         if (backup_started_in_recovery && exclusive)
9447                 ereport(ERROR,
9448                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9449                                  errmsg("recovery is in progress"),
9450                                  errhint("WAL control functions cannot be executed during recovery.")));
9451
9452         /*
9453          * During recovery, we don't need to check WAL level. Because, if WAL
9454          * level is not sufficient, it's impossible to get here during recovery.
9455          */
9456         if (!backup_started_in_recovery && !XLogIsNeeded())
9457                 ereport(ERROR,
9458                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9459                           errmsg("WAL level not sufficient for making an online backup"),
9460                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9461
9462         /*
9463          * OK to update backup counters and forcePageWrites
9464          */
9465         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9466         if (exclusive)
9467                 XLogCtl->Insert.exclusiveBackup = false;
9468         else
9469         {
9470                 /*
9471                  * The user-visible pg_start/stop_backup() functions that operate on
9472                  * exclusive backups can be called at any time, but for non-exclusive
9473                  * backups, it is expected that each do_pg_start_backup() call is
9474                  * matched by exactly one do_pg_stop_backup() call.
9475                  */
9476                 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9477                 XLogCtl->Insert.nonExclusiveBackups--;
9478         }
9479
9480         if (!XLogCtl->Insert.exclusiveBackup &&
9481                 XLogCtl->Insert.nonExclusiveBackups == 0)
9482         {
9483                 XLogCtl->Insert.forcePageWrites = false;
9484         }
9485         LWLockRelease(WALInsertLock);
9486
9487         if (exclusive)
9488         {
9489                 /*
9490                  * Read the existing label file into memory.
9491                  */
9492                 struct stat statbuf;
9493                 int                     r;
9494
9495                 if (stat(BACKUP_LABEL_FILE, &statbuf))
9496                 {
9497                         if (errno != ENOENT)
9498                                 ereport(ERROR,
9499                                                 (errcode_for_file_access(),
9500                                                  errmsg("could not stat file \"%s\": %m",
9501                                                                 BACKUP_LABEL_FILE)));
9502                         ereport(ERROR,
9503                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9504                                          errmsg("a backup is not in progress")));
9505                 }
9506
9507                 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9508                 if (!lfp)
9509                 {
9510                         ereport(ERROR,
9511                                         (errcode_for_file_access(),
9512                                          errmsg("could not read file \"%s\": %m",
9513                                                         BACKUP_LABEL_FILE)));
9514                 }
9515                 labelfile = palloc(statbuf.st_size + 1);
9516                 r = fread(labelfile, statbuf.st_size, 1, lfp);
9517                 labelfile[statbuf.st_size] = '\0';
9518
9519                 /*
9520                  * Close and remove the backup label file
9521                  */
9522                 if (r != 1 || ferror(lfp) || FreeFile(lfp))
9523                         ereport(ERROR,
9524                                         (errcode_for_file_access(),
9525                                          errmsg("could not read file \"%s\": %m",
9526                                                         BACKUP_LABEL_FILE)));
9527                 if (unlink(BACKUP_LABEL_FILE) != 0)
9528                         ereport(ERROR,
9529                                         (errcode_for_file_access(),
9530                                          errmsg("could not remove file \"%s\": %m",
9531                                                         BACKUP_LABEL_FILE)));
9532         }
9533
9534         /*
9535          * Read and parse the START WAL LOCATION line (this code is pretty crude,
9536          * but we are not expecting any variability in the file format).
9537          */
9538         if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
9539                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
9540                            &ch) != 4 || ch != '\n')
9541                 ereport(ERROR,
9542                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9543                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9544         remaining = strchr(labelfile, '\n') + 1;        /* %n is not portable enough */
9545
9546         /*
9547          * Parse the BACKUP FROM line. If we are taking an online backup from the
9548          * standby, we confirm that the standby has not been promoted during the
9549          * backup.
9550          */
9551         ptr = strstr(remaining, "BACKUP FROM:");
9552         if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
9553                 ereport(ERROR,
9554                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9555                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9556         if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
9557                 ereport(ERROR,
9558                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9559                                  errmsg("the standby was promoted during online backup"),
9560                                  errhint("This means that the backup being taken is corrupt "
9561                                                  "and should not be used. "
9562                                                  "Try taking another online backup.")));
9563
9564         /*
9565          * During recovery, we don't write an end-of-backup record. We assume that
9566          * pg_control was backed up last and its minimum recovery point can be
9567          * available as the backup end location. Since we don't have an
9568          * end-of-backup record, we use the pg_control value to check whether
9569          * we've reached the end of backup when starting recovery from this
9570          * backup. We have no way of checking if pg_control wasn't backed up last
9571          * however.
9572          *
9573          * We don't force a switch to new WAL file and wait for all the required
9574          * files to be archived. This is okay if we use the backup to start the
9575          * standby. But, if it's for an archive recovery, to ensure all the
9576          * required files are available, a user should wait for them to be
9577          * archived, or include them into the backup.
9578          *
9579          * We return the current minimum recovery point as the backup end
9580          * location. Note that it would be bigger than the exact backup end
9581          * location if the minimum recovery point is updated since the backup of
9582          * pg_control. This is harmless for current uses.
9583          *
9584          * XXX currently a backup history file is for informational and debug
9585          * purposes only. It's not essential for an online backup. Furthermore,
9586          * even if it's created, it will not be archived during recovery because
9587          * an archiver is not invoked. So it doesn't seem worthwhile to write a
9588          * backup history file during recovery.
9589          */
9590         if (backup_started_in_recovery)
9591         {
9592                 /* use volatile pointer to prevent code rearrangement */
9593                 volatile XLogCtlData *xlogctl = XLogCtl;
9594                 XLogRecPtr      recptr;
9595
9596                 /*
9597                  * Check to see if all WAL replayed during online backup contain
9598                  * full-page writes.
9599                  */
9600                 SpinLockAcquire(&xlogctl->info_lck);
9601                 recptr = xlogctl->lastFpwDisableRecPtr;
9602                 SpinLockRelease(&xlogctl->info_lck);
9603
9604                 if (XLByteLE(startpoint, recptr))
9605                         ereport(ERROR,
9606                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9607                            errmsg("WAL generated with full_page_writes=off was replayed "
9608                                           "during online backup"),
9609                                  errhint("This means that the backup being taken on standby "
9610                                                  "is corrupt and should not be used. "
9611                                  "Enable full_page_writes and run CHECKPOINT on the master, "
9612                                                  "and then try an online backup again.")));
9613
9614
9615                 LWLockAcquire(ControlFileLock, LW_SHARED);
9616                 stoppoint = ControlFile->minRecoveryPoint;
9617                 LWLockRelease(ControlFileLock);
9618
9619                 return stoppoint;
9620         }
9621
9622         /*
9623          * Write the backup-end xlog record
9624          */
9625         rdata.data = (char *) (&startpoint);
9626         rdata.len = sizeof(startpoint);
9627         rdata.buffer = InvalidBuffer;
9628         rdata.next = NULL;
9629         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
9630
9631         /*
9632          * Force a switch to a new xlog segment file, so that the backup is valid
9633          * as soon as archiver moves out the current segment file.
9634          */
9635         RequestXLogSwitch();
9636
9637         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
9638         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
9639
9640         /* Use the log timezone here, not the session timezone */
9641         stamp_time = (pg_time_t) time(NULL);
9642         pg_strftime(strfbuf, sizeof(strfbuf),
9643                                 "%Y-%m-%d %H:%M:%S %Z",
9644                                 pg_localtime(&stamp_time, log_timezone));
9645
9646         /*
9647          * Write the backup history file
9648          */
9649         XLByteToSeg(startpoint, _logId, _logSeg);
9650         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
9651                                                   startpoint.xrecoff % XLogSegSize);
9652         fp = AllocateFile(histfilepath, "w");
9653         if (!fp)
9654                 ereport(ERROR,
9655                                 (errcode_for_file_access(),
9656                                  errmsg("could not create file \"%s\": %m",
9657                                                 histfilepath)));
9658         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
9659                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
9660         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
9661                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
9662         /* transfer remaining lines from label to history file */
9663         fprintf(fp, "%s", remaining);
9664         fprintf(fp, "STOP TIME: %s\n", strfbuf);
9665         if (fflush(fp) || ferror(fp) || FreeFile(fp))
9666                 ereport(ERROR,
9667                                 (errcode_for_file_access(),
9668                                  errmsg("could not write file \"%s\": %m",
9669                                                 histfilepath)));
9670
9671         /*
9672          * Clean out any no-longer-needed history files.  As a side effect, this
9673          * will post a .ready file for the newly created history file, notifying
9674          * the archiver that history file may be archived immediately.
9675          */
9676         CleanupBackupHistory();
9677
9678         /*
9679          * If archiving is enabled, wait for all the required WAL files to be
9680          * archived before returning. If archiving isn't enabled, the required WAL
9681          * needs to be transported via streaming replication (hopefully with
9682          * wal_keep_segments set high enough), or some more exotic mechanism like
9683          * polling and copying files from pg_xlog with script. We have no
9684          * knowledge of those mechanisms, so it's up to the user to ensure that he
9685          * gets all the required WAL.
9686          *
9687          * We wait until both the last WAL file filled during backup and the
9688          * history file have been archived, and assume that the alphabetic sorting
9689          * property of the WAL files ensures any earlier WAL files are safely
9690          * archived as well.
9691          *
9692          * We wait forever, since archive_command is supposed to work and we
9693          * assume the admin wanted his backup to work completely. If you don't
9694          * wish to wait, you can set statement_timeout.  Also, some notices are
9695          * issued to clue in anyone who might be doing this interactively.
9696          */
9697         if (waitforarchive && XLogArchivingActive())
9698         {
9699                 XLByteToPrevSeg(stoppoint, _logId, _logSeg);
9700                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
9701
9702                 XLByteToSeg(startpoint, _logId, _logSeg);
9703                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
9704                                                           startpoint.xrecoff % XLogSegSize);
9705
9706                 seconds_before_warning = 60;
9707                 waits = 0;
9708
9709                 while (XLogArchiveIsBusy(lastxlogfilename) ||
9710                            XLogArchiveIsBusy(histfilename))
9711                 {
9712                         CHECK_FOR_INTERRUPTS();
9713
9714                         if (!reported_waiting && waits > 5)
9715                         {
9716                                 ereport(NOTICE,
9717                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
9718                                 reported_waiting = true;
9719                         }
9720
9721                         pg_usleep(1000000L);
9722
9723                         if (++waits >= seconds_before_warning)
9724                         {
9725                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
9726                                 ereport(WARNING,
9727                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9728                                                                 waits),
9729                                                  errhint("Check that your archive_command is executing properly.  "
9730                                                                  "pg_stop_backup can be canceled safely, "
9731                                                                  "but the database backup will not be usable without all the WAL segments.")));
9732                         }
9733                 }
9734
9735                 ereport(NOTICE,
9736                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
9737         }
9738         else if (waitforarchive)
9739                 ereport(NOTICE,
9740                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9741
9742         /*
9743          * We're done.  As a convenience, return the ending WAL location.
9744          */
9745         return stoppoint;
9746 }
9747
9748
9749 /*
9750  * do_pg_abort_backup: abort a running backup
9751  *
9752  * This does just the most basic steps of do_pg_stop_backup(), by taking the
9753  * system out of backup mode, thus making it a lot more safe to call from
9754  * an error handler.
9755  *
9756  * NB: This is only for aborting a non-exclusive backup that doesn't write
9757  * backup_label. A backup started with pg_stop_backup() needs to be finished
9758  * with pg_stop_backup().
9759  */
9760 void
9761 do_pg_abort_backup(void)
9762 {
9763         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9764         Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
9765         XLogCtl->Insert.nonExclusiveBackups--;
9766
9767         if (!XLogCtl->Insert.exclusiveBackup &&
9768                 XLogCtl->Insert.nonExclusiveBackups == 0)
9769         {
9770                 XLogCtl->Insert.forcePageWrites = false;
9771         }
9772         LWLockRelease(WALInsertLock);
9773 }
9774
9775 /*
9776  * Get latest redo apply position.
9777  *
9778  * Optionally, returns the end byte position of the last restored
9779  * WAL segment. Callers not interested in that value may pass
9780  * NULL for restoreLastRecPtr.
9781  *
9782  * Exported to allow WALReceiver to read the pointer directly.
9783  */
9784 XLogRecPtr
9785 GetXLogReplayRecPtr(XLogRecPtr *restoreLastRecPtr)
9786 {
9787         /* use volatile pointer to prevent code rearrangement */
9788         volatile XLogCtlData *xlogctl = XLogCtl;
9789         XLogRecPtr      recptr;
9790
9791         SpinLockAcquire(&xlogctl->info_lck);
9792         recptr = xlogctl->recoveryLastRecPtr;
9793         if (restoreLastRecPtr)
9794                 *restoreLastRecPtr = xlogctl->restoreLastRecPtr;
9795         SpinLockRelease(&xlogctl->info_lck);
9796
9797         return recptr;
9798 }
9799
9800 /*
9801  * Get current standby flush position, ie, the last WAL position
9802  * known to be fsync'd to disk in standby.
9803  */
9804 XLogRecPtr
9805 GetStandbyFlushRecPtr(void)
9806 {
9807         XLogRecPtr      receivePtr;
9808         XLogRecPtr      replayPtr;
9809         XLogRecPtr      restorePtr;
9810
9811         receivePtr = GetWalRcvWriteRecPtr(NULL);
9812         replayPtr = GetXLogReplayRecPtr(&restorePtr);
9813
9814         if (XLByteLT(receivePtr, replayPtr))
9815                 return XLByteLT(replayPtr, restorePtr) ? restorePtr : replayPtr;
9816         else
9817                 return XLByteLT(receivePtr, restorePtr) ? restorePtr : receivePtr;
9818 }
9819
9820 /*
9821  * Get latest WAL insert pointer
9822  */
9823 XLogRecPtr
9824 GetXLogInsertRecPtr(void)
9825 {
9826         XLogCtlInsert *Insert = &XLogCtl->Insert;
9827         XLogRecPtr      current_recptr;
9828
9829         LWLockAcquire(WALInsertLock, LW_SHARED);
9830         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
9831         LWLockRelease(WALInsertLock);
9832
9833         return current_recptr;
9834 }
9835
9836 /*
9837  * Get latest WAL write pointer
9838  */
9839 XLogRecPtr
9840 GetXLogWriteRecPtr(void)
9841 {
9842         {
9843                 /* use volatile pointer to prevent code rearrangement */
9844                 volatile XLogCtlData *xlogctl = XLogCtl;
9845
9846                 SpinLockAcquire(&xlogctl->info_lck);
9847                 LogwrtResult = xlogctl->LogwrtResult;
9848                 SpinLockRelease(&xlogctl->info_lck);
9849         }
9850
9851         return LogwrtResult.Write;
9852 }
9853
9854 /*
9855  * read_backup_label: check to see if a backup_label file is present
9856  *
9857  * If we see a backup_label during recovery, we assume that we are recovering
9858  * from a backup dump file, and we therefore roll forward from the checkpoint
9859  * identified by the label file, NOT what pg_control says.      This avoids the
9860  * problem that pg_control might have been archived one or more checkpoints
9861  * later than the start of the dump, and so if we rely on it as the start
9862  * point, we will fail to restore a consistent database state.
9863  *
9864  * Returns TRUE if a backup_label was found (and fills the checkpoint
9865  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9866  * respectively); returns FALSE if not. If this backup_label came from a
9867  * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
9868  * was created during recovery, *backupFromStandby is set to TRUE.
9869  */
9870 static bool
9871 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
9872                                   bool *backupFromStandby)
9873 {
9874         char            startxlogfilename[MAXFNAMELEN];
9875         TimeLineID      tli;
9876         FILE       *lfp;
9877         char            ch;
9878         char            backuptype[20];
9879         char            backupfrom[20];
9880
9881         *backupEndRequired = false;
9882         *backupFromStandby = false;
9883
9884         /*
9885          * See if label file is present
9886          */
9887         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9888         if (!lfp)
9889         {
9890                 if (errno != ENOENT)
9891                         ereport(FATAL,
9892                                         (errcode_for_file_access(),
9893                                          errmsg("could not read file \"%s\": %m",
9894                                                         BACKUP_LABEL_FILE)));
9895                 return false;                   /* it's not there, all is fine */
9896         }
9897
9898         /*
9899          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9900          * is pretty crude, but we are not expecting any variability in the file
9901          * format).
9902          */
9903         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9904                            &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
9905                            startxlogfilename, &ch) != 5 || ch != '\n')
9906                 ereport(FATAL,
9907                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9908                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9909         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9910                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
9911                            &ch) != 3 || ch != '\n')
9912                 ereport(FATAL,
9913                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9914                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9915
9916         /*
9917          * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
9918          * from an older backup anyway, but since the information on it is not
9919          * strictly required, don't error out if it's missing for some reason.
9920          */
9921         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
9922         {
9923                 if (strcmp(backuptype, "streamed") == 0)
9924                         *backupEndRequired = true;
9925         }
9926
9927         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
9928         {
9929                 if (strcmp(backupfrom, "standby") == 0)
9930                         *backupFromStandby = true;
9931         }
9932
9933         if (ferror(lfp) || FreeFile(lfp))
9934                 ereport(FATAL,
9935                                 (errcode_for_file_access(),
9936                                  errmsg("could not read file \"%s\": %m",
9937                                                 BACKUP_LABEL_FILE)));
9938
9939         return true;
9940 }
9941
9942 /*
9943  * Error context callback for errors occurring during rm_redo().
9944  */
9945 static void
9946 rm_redo_error_callback(void *arg)
9947 {
9948         XLogRecord *record = (XLogRecord *) arg;
9949         StringInfoData buf;
9950
9951         initStringInfo(&buf);
9952         RmgrTable[record->xl_rmid].rm_desc(&buf,
9953                                                                            record->xl_info,
9954                                                                            XLogRecGetData(record));
9955
9956         /* don't bother emitting empty description */
9957         if (buf.len > 0)
9958                 errcontext("xlog redo %s", buf.data);
9959
9960         pfree(buf.data);
9961 }
9962
9963 /*
9964  * BackupInProgress: check if online backup mode is active
9965  *
9966  * This is done by checking for existence of the "backup_label" file.
9967  */
9968 bool
9969 BackupInProgress(void)
9970 {
9971         struct stat stat_buf;
9972
9973         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9974 }
9975
9976 /*
9977  * CancelBackup: rename the "backup_label" file to cancel backup mode
9978  *
9979  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9980  * Note that this will render an online backup in progress useless.
9981  * To correctly finish an online backup, pg_stop_backup must be called.
9982  */
9983 void
9984 CancelBackup(void)
9985 {
9986         struct stat stat_buf;
9987
9988         /* if the file is not there, return */
9989         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9990                 return;
9991
9992         /* remove leftover file from previously canceled backup if it exists */
9993         unlink(BACKUP_LABEL_OLD);
9994
9995         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9996         {
9997                 ereport(LOG,
9998                                 (errmsg("online backup mode canceled"),
9999                                  errdetail("\"%s\" was renamed to \"%s\".",
10000                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10001         }
10002         else
10003         {
10004                 ereport(WARNING,
10005                                 (errcode_for_file_access(),
10006                                  errmsg("online backup mode was not canceled"),
10007                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
10008                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
10009         }
10010 }
10011
10012 /*
10013  * Read the XLOG page containing RecPtr into readBuf (if not read already).
10014  * Returns true if the page is read successfully.
10015  *
10016  * This is responsible for restoring files from archive as needed, as well
10017  * as for waiting for the requested WAL record to arrive in standby mode.
10018  *
10019  * 'emode' specifies the log level used for reporting "file not found" or
10020  * "end of WAL" situations in archive recovery, or in standby mode when a
10021  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
10022  * false in those situations, on higher log levels the ereport() won't
10023  * return.
10024  *
10025  * In standby mode, if after a successful return of XLogPageRead() the
10026  * caller finds the record it's interested in to be broken, it should
10027  * ereport the error with the level determined by
10028  * emode_for_corrupt_record(), and then set "failedSources |= readSource"
10029  * and call XLogPageRead() again with the same arguments. This lets
10030  * XLogPageRead() to try fetching the record from another source, or to
10031  * sleep and retry.
10032  */
10033 static bool
10034 XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
10035                          bool randAccess)
10036 {
10037         static XLogRecPtr receivedUpto = {0, 0};
10038         bool            switched_segment = false;
10039         uint32          targetPageOff;
10040         uint32          targetRecOff;
10041         uint32          targetId;
10042         uint32          targetSeg;
10043         static pg_time_t last_fail_time = 0;
10044
10045         XLByteToSeg(*RecPtr, targetId, targetSeg);
10046         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
10047         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
10048
10049         /* Fast exit if we have read the record in the current buffer already */
10050         if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
10051                 targetPageOff == readOff && targetRecOff < readLen)
10052                 return true;
10053
10054         /*
10055          * See if we need to switch to a new segment because the requested record
10056          * is not in the currently open one.
10057          */
10058         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
10059         {
10060                 /*
10061                  * Request a restartpoint if we've replayed too much xlog since the
10062                  * last one.
10063                  */
10064                 if (StandbyMode && bgwriterLaunched)
10065                 {
10066                         if (XLogCheckpointNeeded(readId, readSeg))
10067                         {
10068                                 (void) GetRedoRecPtr();
10069                                 if (XLogCheckpointNeeded(readId, readSeg))
10070                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
10071                         }
10072                 }
10073
10074                 close(readFile);
10075                 readFile = -1;
10076                 readSource = 0;
10077         }
10078
10079         XLByteToSeg(*RecPtr, readId, readSeg);
10080
10081 retry:
10082         /* See if we need to retrieve more data */
10083         if (readFile < 0 ||
10084                 (readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
10085         {
10086                 if (StandbyMode)
10087                 {
10088                         /*
10089                          * In standby mode, wait for the requested record to become
10090                          * available, either via restore_command succeeding to restore the
10091                          * segment, or via walreceiver having streamed the record.
10092                          */
10093                         for (;;)
10094                         {
10095                                 if (WalRcvInProgress())
10096                                 {
10097                                         bool            havedata;
10098
10099                                         /*
10100                                          * If we find an invalid record in the WAL streamed from
10101                                          * master, something is seriously wrong. There's little
10102                                          * chance that the problem will just go away, but PANIC is
10103                                          * not good for availability either, especially in hot
10104                                          * standby mode. Disconnect, and retry from
10105                                          * archive/pg_xlog again. The WAL in the archive should be
10106                                          * identical to what was streamed, so it's unlikely that
10107                                          * it helps, but one can hope...
10108                                          */
10109                                         if (failedSources & XLOG_FROM_STREAM)
10110                                         {
10111                                                 ShutdownWalRcv();
10112                                                 continue;
10113                                         }
10114
10115                                         /*
10116                                          * Walreceiver is active, so see if new data has arrived.
10117                                          *
10118                                          * We only advance XLogReceiptTime when we obtain fresh
10119                                          * WAL from walreceiver and observe that we had already
10120                                          * processed everything before the most recent "chunk"
10121                                          * that it flushed to disk.  In steady state where we are
10122                                          * keeping up with the incoming data, XLogReceiptTime will
10123                                          * be updated on each cycle.  When we are behind,
10124                                          * XLogReceiptTime will not advance, so the grace time
10125                                          * allotted to conflicting queries will decrease.
10126                                          */
10127                                         if (XLByteLT(*RecPtr, receivedUpto))
10128                                                 havedata = true;
10129                                         else
10130                                         {
10131                                                 XLogRecPtr      latestChunkStart;
10132
10133                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
10134                                                 if (XLByteLT(*RecPtr, receivedUpto))
10135                                                 {
10136                                                         havedata = true;
10137                                                         if (!XLByteLT(*RecPtr, latestChunkStart))
10138                                                         {
10139                                                                 XLogReceiptTime = GetCurrentTimestamp();
10140                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
10141                                                         }
10142                                                 }
10143                                                 else
10144                                                         havedata = false;
10145                                         }
10146                                         if (havedata)
10147                                         {
10148                                                 /*
10149                                                  * Great, streamed far enough. Open the file if it's
10150                                                  * not open already.  Use XLOG_FROM_STREAM so that
10151                                                  * source info is set correctly and XLogReceiptTime
10152                                                  * isn't changed.
10153                                                  */
10154                                                 if (readFile < 0)
10155                                                 {
10156                                                         readFile =
10157                                                                 XLogFileRead(readId, readSeg, PANIC,
10158                                                                                          recoveryTargetTLI,
10159                                                                                          XLOG_FROM_STREAM, false);
10160                                                         Assert(readFile >= 0);
10161                                                         switched_segment = true;
10162                                                 }
10163                                                 else
10164                                                 {
10165                                                         /* just make sure source info is correct... */
10166                                                         readSource = XLOG_FROM_STREAM;
10167                                                         XLogReceiptSource = XLOG_FROM_STREAM;
10168                                                 }
10169                                                 break;
10170                                         }
10171
10172                                         /*
10173                                          * Data not here yet, so check for trigger then sleep for
10174                                          * five seconds like in the WAL file polling case below.
10175                                          */
10176                                         if (CheckForStandbyTrigger())
10177                                                 goto retry;
10178
10179                                         /*
10180                                          * Wait for more WAL to arrive, or timeout to be reached
10181                                          */
10182                                         WaitLatch(&XLogCtl->recoveryWakeupLatch,
10183                                                           WL_LATCH_SET | WL_TIMEOUT,
10184                                                           5000L);
10185                                         ResetLatch(&XLogCtl->recoveryWakeupLatch);
10186                                 }
10187                                 else
10188                                 {
10189                                         int                     sources;
10190                                         pg_time_t       now;
10191
10192                                         /*
10193                                          * Until walreceiver manages to reconnect, poll the
10194                                          * archive.
10195                                          */
10196                                         if (readFile >= 0)
10197                                         {
10198                                                 close(readFile);
10199                                                 readFile = -1;
10200                                         }
10201                                         /* Reset curFileTLI if random fetch. */
10202                                         if (randAccess)
10203                                                 curFileTLI = 0;
10204
10205                                         /*
10206                                          * Try to restore the file from archive, or read an
10207                                          * existing file from pg_xlog.
10208                                          */
10209                                         sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
10210                                         if (!(sources & ~failedSources))
10211                                         {
10212                                                 /*
10213                                                  * We've exhausted all options for retrieving the
10214                                                  * file. Retry.
10215                                                  */
10216                                                 failedSources = 0;
10217
10218                                                 /*
10219                                                  * Before we sleep, re-scan for possible new timelines
10220                                                  * if we were requested to recover to the latest
10221                                                  * timeline.
10222                                                  */
10223                                                 if (recoveryTargetIsLatest)
10224                                                 {
10225                                                         if (rescanLatestTimeLine())
10226                                                                 continue;
10227                                                 }
10228
10229                                                 /*
10230                                                  * If it hasn't been long since last attempt, sleep to
10231                                                  * avoid busy-waiting.
10232                                                  */
10233                                                 now = (pg_time_t) time(NULL);
10234                                                 if ((now - last_fail_time) < 5)
10235                                                 {
10236                                                         pg_usleep(1000000L * (5 - (now - last_fail_time)));
10237                                                         now = (pg_time_t) time(NULL);
10238                                                 }
10239                                                 last_fail_time = now;
10240
10241                                                 /*
10242                                                  * If primary_conninfo is set, launch walreceiver to
10243                                                  * try to stream the missing WAL, before retrying to
10244                                                  * restore from archive/pg_xlog.
10245                                                  *
10246                                                  * If fetching_ckpt is TRUE, RecPtr points to the
10247                                                  * initial checkpoint location. In that case, we use
10248                                                  * RedoStartLSN as the streaming start position
10249                                                  * instead of RecPtr, so that when we later jump
10250                                                  * backwards to start redo at RedoStartLSN, we will
10251                                                  * have the logs streamed already.
10252                                                  */
10253                                                 if (PrimaryConnInfo)
10254                                                 {
10255                                                         RequestXLogStreaming(
10256                                                                           fetching_ckpt ? RedoStartLSN : *RecPtr,
10257                                                                                                  PrimaryConnInfo);
10258                                                         continue;
10259                                                 }
10260                                         }
10261                                         /* Don't try to read from a source that just failed */
10262                                         sources &= ~failedSources;
10263                                         readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
10264                                                                                                   sources);
10265                                         switched_segment = true;
10266                                         if (readFile >= 0)
10267                                                 break;
10268
10269                                         /*
10270                                          * Nope, not found in archive and/or pg_xlog.
10271                                          */
10272                                         failedSources |= sources;
10273
10274                                         /*
10275                                          * Check to see if the trigger file exists. Note that we
10276                                          * do this only after failure, so when you create the
10277                                          * trigger file, we still finish replaying as much as we
10278                                          * can from archive and pg_xlog before failover.
10279                                          */
10280                                         if (CheckForStandbyTrigger())
10281                                                 goto triggered;
10282                                 }
10283
10284                                 /*
10285                                  * This possibly-long loop needs to handle interrupts of
10286                                  * startup process.
10287                                  */
10288                                 HandleStartupProcInterrupts();
10289                         }
10290                 }
10291                 else
10292                 {
10293                         /* In archive or crash recovery. */
10294                         if (readFile < 0)
10295                         {
10296                                 int                     sources;
10297
10298                                 /* Reset curFileTLI if random fetch. */
10299                                 if (randAccess)
10300                                         curFileTLI = 0;
10301
10302                                 sources = XLOG_FROM_PG_XLOG;
10303                                 if (InArchiveRecovery)
10304                                         sources |= XLOG_FROM_ARCHIVE;
10305
10306                                 readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
10307                                                                                           sources);
10308                                 switched_segment = true;
10309                                 if (readFile < 0)
10310                                         return false;
10311                         }
10312                 }
10313         }
10314
10315         /*
10316          * At this point, we have the right segment open and if we're streaming we
10317          * know the requested record is in it.
10318          */
10319         Assert(readFile != -1);
10320
10321         /*
10322          * If the current segment is being streamed from master, calculate how
10323          * much of the current page we have received already. We know the
10324          * requested record has been received, but this is for the benefit of
10325          * future calls, to allow quick exit at the top of this function.
10326          */
10327         if (readSource == XLOG_FROM_STREAM)
10328         {
10329                 if (RecPtr->xlogid != receivedUpto.xlogid ||
10330                         (RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
10331                 {
10332                         readLen = XLOG_BLCKSZ;
10333                 }
10334                 else
10335                         readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
10336         }
10337         else
10338                 readLen = XLOG_BLCKSZ;
10339
10340         if (switched_segment && targetPageOff != 0)
10341         {
10342                 /*
10343                  * Whenever switching to a new WAL segment, we read the first page of
10344                  * the file and validate its header, even if that's not where the
10345                  * target record is.  This is so that we can check the additional
10346                  * identification info that is present in the first page's "long"
10347                  * header.
10348                  */
10349                 readOff = 0;
10350                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10351                 {
10352                         ereport(emode_for_corrupt_record(emode, *RecPtr),
10353                                         (errcode_for_file_access(),
10354                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
10355                                                         readId, readSeg, readOff)));
10356                         goto next_record_is_invalid;
10357                 }
10358                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10359                         goto next_record_is_invalid;
10360         }
10361
10362         /* Read the requested page */
10363         readOff = targetPageOff;
10364         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
10365         {
10366                 ereport(emode_for_corrupt_record(emode, *RecPtr),
10367                                 (errcode_for_file_access(),
10368                  errmsg("could not seek in log file %u, segment %u to offset %u: %m",
10369                                 readId, readSeg, readOff)));
10370                 goto next_record_is_invalid;
10371         }
10372         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
10373         {
10374                 ereport(emode_for_corrupt_record(emode, *RecPtr),
10375                                 (errcode_for_file_access(),
10376                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
10377                                 readId, readSeg, readOff)));
10378                 goto next_record_is_invalid;
10379         }
10380         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
10381                 goto next_record_is_invalid;
10382
10383         Assert(targetId == readId);
10384         Assert(targetSeg == readSeg);
10385         Assert(targetPageOff == readOff);
10386         Assert(targetRecOff < readLen);
10387
10388         return true;
10389
10390 next_record_is_invalid:
10391         failedSources |= readSource;
10392
10393         if (readFile >= 0)
10394                 close(readFile);
10395         readFile = -1;
10396         readLen = 0;
10397         readSource = 0;
10398
10399         /* In standby-mode, keep trying */
10400         if (StandbyMode)
10401                 goto retry;
10402         else
10403                 return false;
10404
10405 triggered:
10406         if (readFile >= 0)
10407                 close(readFile);
10408         readFile = -1;
10409         readLen = 0;
10410         readSource = 0;
10411
10412         return false;
10413 }
10414
10415 /*
10416  * Determine what log level should be used to report a corrupt WAL record
10417  * in the current WAL page, previously read by XLogPageRead().
10418  *
10419  * 'emode' is the error mode that would be used to report a file-not-found
10420  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
10421  * we're retrying the exact same record that we've tried previously, only
10422  * complain the first time to keep the noise down.      However, we only do when
10423  * reading from pg_xlog, because we don't expect any invalid records in archive
10424  * or in records streamed from master. Files in the archive should be complete,
10425  * and we should never hit the end of WAL because we stop and wait for more WAL
10426  * to arrive before replaying it.
10427  *
10428  * NOTE: This function remembers the RecPtr value it was last called with,
10429  * to suppress repeated messages about the same record. Only call this when
10430  * you are about to ereport(), or you might cause a later message to be
10431  * erroneously suppressed.
10432  */
10433 static int
10434 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
10435 {
10436         static XLogRecPtr lastComplaint = {0, 0};
10437
10438         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
10439         {
10440                 if (XLByteEQ(RecPtr, lastComplaint))
10441                         emode = DEBUG1;
10442                 else
10443                         lastComplaint = RecPtr;
10444         }
10445         return emode;
10446 }
10447
10448 /*
10449  * Check to see whether the user-specified trigger file exists and whether a
10450  * promote request has arrived.  If either condition holds, request postmaster
10451  * to shut down walreceiver, wait for it to exit, and return true.
10452  */
10453 static bool
10454 CheckForStandbyTrigger(void)
10455 {
10456         struct stat stat_buf;
10457         static bool triggered = false;
10458
10459         if (triggered)
10460                 return true;
10461
10462         if (IsPromoteTriggered())
10463         {
10464                 ereport(LOG,
10465                                 (errmsg("received promote request")));
10466                 ShutdownWalRcv();
10467                 ResetPromoteTriggered();
10468                 triggered = true;
10469                 return true;
10470         }
10471
10472         if (TriggerFile == NULL)
10473                 return false;
10474
10475         if (stat(TriggerFile, &stat_buf) == 0)
10476         {
10477                 ereport(LOG,
10478                                 (errmsg("trigger file found: %s", TriggerFile)));
10479                 ShutdownWalRcv();
10480                 unlink(TriggerFile);
10481                 triggered = true;
10482                 return true;
10483         }
10484         return false;
10485 }
10486
10487 /*
10488  * Check to see if a promote request has arrived. Should be
10489  * called by postmaster after receiving SIGUSR1.
10490  */
10491 bool
10492 CheckPromoteSignal(void)
10493 {
10494         struct stat stat_buf;
10495
10496         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
10497         {
10498                 /*
10499                  * Since we are in a signal handler, it's not safe to elog. We
10500                  * silently ignore any error from unlink.
10501                  */
10502                 unlink(PROMOTE_SIGNAL_FILE);
10503                 return true;
10504         }
10505         return false;
10506 }
10507
10508 /*
10509  * Wake up startup process to replay newly arrived WAL, or to notice that
10510  * failover has been requested.
10511  */
10512 void
10513 WakeupRecovery(void)
10514 {
10515         SetLatch(&XLogCtl->recoveryWakeupLatch);
10516 }
10517
10518 /*
10519  * Update the WalWriterSleeping flag.
10520  */
10521 void
10522 SetWalWriterSleeping(bool sleeping)
10523 {
10524         /* use volatile pointer to prevent code rearrangement */
10525         volatile XLogCtlData *xlogctl = XLogCtl;
10526
10527         SpinLockAcquire(&xlogctl->info_lck);
10528         xlogctl->WalWriterSleeping = sleeping;
10529         SpinLockRelease(&xlogctl->info_lck);
10530 }