]> granicus.if.org Git - postgresql/blobdiff - src/backend/access/transam/xlog.c
Have multixact be truncated by checkpoint, not vacuum
[postgresql] / src / backend / access / transam / xlog.c
index acf0dd187619b03a2fc08387fd61301b05d6dd3c..e5640793eb8e09355b09d13cbdb68d3a9773ae6b 100644 (file)
@@ -4,7 +4,7 @@
  *             PostgreSQL transaction log manager
  *
  *
- * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * src/backend/access/transam/xlog.c
@@ -23,6 +23,7 @@
 
 #include "access/clog.h"
 #include "access/multixact.h"
+#include "access/rewriteheap.h"
 #include "access/subtrans.h"
 #include "access/timeline.h"
 #include "access/transam.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "replication/logical.h"
+#include "replication/slot.h"
+#include "replication/snapbuild.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/barrier.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/large_object.h"
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/predicate.h"
@@ -66,8 +71,8 @@ extern uint32 bootstrap_data_checksum_version;
 /* File path names (all relative to $PGDATA) */
 #define RECOVERY_COMMAND_FILE  "recovery.conf"
 #define RECOVERY_COMMAND_DONE  "recovery.done"
-#define PROMOTE_SIGNAL_FILE "promote"
-#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
+#define PROMOTE_SIGNAL_FILE            "promote"
+#define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
 
 
 /* User-settable parameters */
@@ -79,12 +84,13 @@ bool                XLogArchiveMode = false;
 char      *XLogArchiveCommand = NULL;
 bool           EnableHotStandby = false;
 bool           fullPageWrites = true;
+bool           wal_log_hints = false;
 bool           log_checkpoints = false;
 int                    sync_method = DEFAULT_SYNC_METHOD;
 int                    wal_level = WAL_LEVEL_MINIMAL;
 int                    CommitDelay = 0;        /* precommit delay in microseconds */
 int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
-int                    num_xloginsert_slots = 8;
+int                    num_xloginsert_locks = 8;
 
 #ifdef WAL_DEBUG
 bool           XLOG_DEBUG = false;
@@ -96,7 +102,7 @@ bool         XLOG_DEBUG = false;
  * future XLOG segment as long as there aren't already XLOGfileslop future
  * segments; else we'll delete it.  This could be made a separate GUC
  * variable, but at present I think it's sufficient to hardwire it as
- * 2*CheckPointSegments+1.     Under normal conditions, a checkpoint will free
+ * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  * of them; the +1 allows boundary cases to happen without wasting a
  * delete/create-segment cycle.
@@ -126,7 +132,7 @@ const struct config_enum_entry sync_method_options[] = {
 
 /*
  * Statistics for current checkpoint are collected in this global struct.
- * Because only the background writer or a stand-alone backend can perform
+ * Because only the checkpointer or a stand-alone backend can perform
  * checkpoints, this will be unused in normal backends.
  */
 CheckpointStatsData CheckpointStats;
@@ -185,7 +191,7 @@ static bool LocalHotStandbyActive = false;
  *             0: unconditionally not allowed to insert XLOG
  *             -1: must check RecoveryInProgress(); disallow until it is false
  * Most processes start with -1 and transition to 1 after seeing that recovery
- * is not in progress. But we can also force the value for special cases.
+ * is not in progress.  But we can also force the value for special cases.
  * The coding in XLogInsertAllowed() depends on the first two of these states
  * being numerically the same as bool true and false.
  */
@@ -218,10 +224,13 @@ static bool recoveryPauseAtTarget = true;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static char *recoveryTargetName;
+static int     recovery_min_apply_delay = 0;
+static TimestampTz recoveryDelayUntilTime;
 
 /* options taken from recovery.conf for XLOG streaming */
 static bool StandbyModeRequested = false;
 static char *PrimaryConnInfo = NULL;
+static char *PrimarySlotName = NULL;
 static char *TriggerFile = NULL;
 
 /* are we currently in standby mode? */
@@ -230,7 +239,10 @@ bool               StandbyMode = false;
 /* whether request for fast promotion has been made yet */
 static bool fast_promote = false;
 
-/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
+/*
+ * if recoveryStopsBefore/After returns true, it saves information of the stop
+ * point here
+ */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
 static char recoveryStopName[MAXFNAMELEN];
@@ -250,7 +262,7 @@ static bool recoveryStopAfter;
  *
  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
  * its known parents, newest first (so recoveryTargetTLI is always the
- * first list member). Only these TLIs are expected to be seen in the WAL
+ * first list member).  Only these TLIs are expected to be seen in the WAL
  * segments we read, and indeed only these TLIs will be considered as
  * candidate WAL files to open at all.
  *
@@ -279,9 +291,9 @@ XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
 /*
  * RedoRecPtr is this backend's local copy of the REDO record pointer
  * (which is almost but not quite the same as a pointer to the most recent
- * CHECKPOINT record). We update this from the shared-memory copy,
+ * CHECKPOINT record).  We update this from the shared-memory copy,
  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
- * hold an insertion slot).  See XLogInsert for details.  We are also allowed
+ * hold an insertion lock).  See XLogInsert for details.  We are also allowed
  * to update from XLogCtl->RedoRecPtr if we hold the info_lck;
  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
  * InitXLOGAccess.
@@ -353,63 +365,51 @@ typedef struct XLogwrtResult
        XLogRecPtr      Flush;                  /* last byte + 1 flushed */
 } XLogwrtResult;
 
-
 /*
- * A slot for inserting to the WAL. This is similar to an LWLock, the main
- * difference is that there is an extra xlogInsertingAt field that is protected
- * by the same mutex. Unlike an LWLock, a slot can only be acquired in
- * exclusive mode.
- *
- * The xlogInsertingAt field is used to advertise to other processes how far
- * the slot owner has progressed in inserting the record. When a backend
- * acquires a slot, it initializes xlogInsertingAt to 1, because it doesn't
- * yet know where it's going to insert the record. That's conservative
- * but correct; the new insertion is certainly going to go to a byte position
- * greater than 1. If another backend needs to flush the WAL, it will have to
- * wait for the new insertion. xlogInsertingAt is updated after finishing the
- * insert or when crossing a page boundary, which will wake up anyone waiting
- * for it, whether the wait was necessary in the first place or not.
- *
- * A process can wait on a slot in two modes: LW_EXCLUSIVE or
- * LW_WAIT_UNTIL_FREE. LW_EXCLUSIVE works like in an lwlock; when the slot is
- * released, the first LW_EXCLUSIVE waiter in the queue is woken up. Processes
- * waiting in LW_WAIT_UNTIL_FREE mode are woken up whenever the slot is
- * released, or xlogInsertingAt is updated. In other words, a process in
- * LW_WAIT_UNTIL_FREE mode is woken up whenever the inserter makes any progress
- * copying the record in place. LW_WAIT_UNTIL_FREE waiters are always added to
- * the front of the queue, while LW_EXCLUSIVE waiters are appended to the end.
- *
- * To join the wait queue, a process must set MyProc->lwWaitMode to the mode
- * it wants to wait in, MyProc->lwWaiting to true, and link MyProc to the head
- * or tail of the wait queue. The same mechanism is used to wait on an LWLock,
- * see lwlock.c for details.
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
+ * indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later in the WAL,
+ * so that you only wait for the insertions that are modifying the buffers
+ * you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value,
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
  */
 typedef struct
 {
-       slock_t         mutex;                  /* protects the below fields */
-       XLogRecPtr      xlogInsertingAt; /* insert has completed up to this point */
-
-       PGPROC     *owner;                      /* for debugging purposes */
-
-       bool            releaseOK;              /* T if ok to release waiters */
-       char            exclusive;              /* # of exclusive holders (0 or 1) */
-       PGPROC     *head;                       /* head of list of waiting PGPROCs */
-       PGPROC     *tail;                       /* tail of list of waiting PGPROCs */
-       /* tail is undefined when head is NULL */
-} XLogInsertSlot;
+       LWLock          lock;
+       XLogRecPtr      insertingAt;
+} WALInsertLock;
 
 /*
- * All the slots are allocated as an array in shared memory. We force the
- * array stride to be a power of 2, which saves a few cycles in indexing, but
- * more importantly also ensures that individual slots don't cross cache line
- * boundaries. (Of course, we have to also ensure that the array start
- * address is suitably aligned.)
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual locks don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
  */
-typedef union XLogInsertSlotPadded
+typedef union WALInsertLockPadded
 {
-       XLogInsertSlot slot;
-       char            pad[64];
-} XLogInsertSlotPadded;
+       WALInsertLock l;
+       char            pad[CACHE_LINE_SIZE];
+} WALInsertLockPadded;
 
 /*
  * Shared state data for XLogInsert.
@@ -419,17 +419,23 @@ typedef struct XLogCtlInsert
        slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
 
        /*
-        * CurrBytePos is the end of reserved WAL. The next record will be inserted
-        * at that position. PrevBytePos is the start position of the previously
-        * inserted (or rather, reserved) record - it is copied to the the prev-
-        * link of the next record. These are stored as "usable byte positions"
-        * rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
+        * CurrBytePos is the end of reserved WAL. The next record will be
+        * inserted at that position. PrevBytePos is the start position of the
+        * previously inserted (or rather, reserved) record - it is copied to the
+        * prev-link of the next record. These are stored as "usable byte
+        * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
         */
        uint64          CurrBytePos;
        uint64          PrevBytePos;
 
-       /* insertion slots, see above for details */
-       XLogInsertSlotPadded *insertSlots;
+       /*
+        * Make sure the above heavily-contended spinlock and byte positions are
+        * on their own cache line. In particular, the RedoRecPtr and full page
+        * write variables below should be on a different cache line. They are
+        * read on every WAL insertion, but updated rarely, and we don't want
+        * those reads to steal the cache line containing Curr/PrevBytePos.
+        */
+       char            pad[CACHE_LINE_SIZE];
 
        /*
         * fullPageWrites is the master copy used by all backends to determine
@@ -438,8 +444,8 @@ typedef struct XLogCtlInsert
         * we must WAL-log it before it actually affects WAL-logging by backends.
         * Checkpointer sets at startup or after SIGHUP.
         *
-        * To read these fields, you must hold an insertion slot. To modify them,
-        * you must hold ALL the slots.
+        * To read these fields, you must hold an insertion lock. To modify them,
+        * you must hold ALL the locks.
         */
        XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
        bool            forcePageWrites;        /* forcing full-page writes for PITR? */
@@ -455,16 +461,14 @@ typedef struct XLogCtlInsert
        bool            exclusiveBackup;
        int                     nonExclusiveBackups;
        XLogRecPtr      lastBackupStart;
-} XLogCtlInsert;
 
-/*
- * Shared state data for XLogWrite/XLogFlush.
- */
-typedef struct XLogCtlWrite
-{
-       int                     curridx;                /* cache index of next block to write */
-       pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
-} XLogCtlWrite;
+       /*
+        * WAL insertion locks.
+        */
+       WALInsertLockPadded *WALInsertLocks;
+       LWLockTranche WALInsertLockTranche;
+       int                     WALInsertLockTrancheId;
+} XLogCtlInsert;
 
 /*
  * Total shared-memory state for XLOG.
@@ -479,15 +483,17 @@ typedef struct XLogCtlData
        uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
        TransactionId ckptXid;
        XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
+       XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
+
        XLogSegNo       lastRemovedSegNo;               /* latest removed/recycled XLOG
                                                                                 * segment */
 
-       /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
+       /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
        XLogRecPtr      unloggedLSN;
        slock_t         ulsn_lck;
 
-       /* Protected by WALWriteLock: */
-       XLogCtlWrite Write;
+       /* Time of last xlog segment switch. Protected by WALWriteLock. */
+       pg_time_t       lastSegSwitchTime;
 
        /*
         * Protected by info_lck and WALWriteLock (you must hold either lock to
@@ -496,15 +502,16 @@ typedef struct XLogCtlData
        XLogwrtResult LogwrtResult;
 
        /*
-        * Latest initialized block index in cache.
+        * Latest initialized page in the cache (last byte position + 1).
         *
-        * To change curridx and the identity of a buffer, you need to hold
-        * WALBufMappingLock.  To change the identity of a buffer that's still
-        * dirty, the old page needs to be written out first, and for that you
-        * need WALWriteLock, and you need to ensure that there are no in-progress
-        * insertions to the page by calling WaitXLogInsertionsToFinish().
+        * To change the identity of a buffer (and InitializedUpTo), you need to
+        * hold WALBufMappingLock.  To change the identity of a buffer that's
+        * still dirty, the old page needs to be written out first, and for that
+        * you need WALWriteLock, and you need to ensure that there are no
+        * in-progress insertions to the page by calling
+        * WaitXLogInsertionsToFinish().
         */
-       int                     curridx;
+       XLogRecPtr      InitializedUpTo;
 
        /*
         * These values do not change after startup, although the pointed-to pages
@@ -599,6 +606,9 @@ typedef struct XLogCtlData
 
 static XLogCtlData *XLogCtl = NULL;
 
+/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
+static WALInsertLockPadded *WALInsertLocks = NULL;
+
 /*
  * We maintain an image of pg_control in shared memory.
  */
@@ -618,16 +628,10 @@ static ControlFileData *ControlFile = NULL;
 /*
  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
  * would hold if it was in cache, the page containing 'recptr'.
- *
- * XLogRecEndPtrToBufIdx is the same, but a pointer to the first byte of a
- * page is taken to mean the previous page.
  */
 #define XLogRecPtrToBufIdx(recptr)     \
        (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 
-#define XLogRecEndPtrToBufIdx(recptr)  \
-       ((((recptr) - 1) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
-
 /*
  * These are the number of bytes in a WAL page and segment usable for WAL data.
  */
@@ -649,7 +653,7 @@ typedef enum
        XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
        XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
        XLOG_FROM_PG_XLOG,                      /* existing file in pg_xlog */
-       XLOG_FROM_STREAM,                       /* streamed from master */
+       XLOG_FROM_STREAM                        /* streamed from master */
 } XLogSource;
 
 /* human-readable names for XLogSources, for debugging output */
@@ -728,14 +732,16 @@ static bool InRedo = false;
 /* Have we launched bgwriter during recovery? */
 static bool bgwriterLaunched = false;
 
-/* For WALInsertSlotAcquire/Release functions */
-static int     MySlotNo = 0;
-static bool holdingAllSlots = false;
+/* For WALInsertLockAcquire/Release functions */
+static int     MyLockNo = 0;
+static bool holdingAllLocks = false;
 
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
-static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
+static bool recoveryStopsBefore(XLogRecord *record);
+static bool recoveryStopsAfter(XLogRecord *record);
 static void recoveryPausesHere(void);
+static bool recoveryApplyDelay(XLogRecord *record);
 static void SetLatestXTime(TimestampTz xtime);
 static void SetCurrentChunkStartTime(TimestampTz xtime);
 static void CheckRequiredParameterValues(void);
@@ -746,6 +752,7 @@ static void LocalSetXLogInsertAllowed(void);
 static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
+static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 
 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
                                XLogRecPtr *lsn, BkpBlock *bkpb);
@@ -794,23 +801,22 @@ static void rm_redo_error_callback(void *arg);
 static int     get_sync_bit(int method);
 
 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
-                                 XLogRecData *rdata,
-                                 XLogRecPtr StartPos, XLogRecPtr EndPos);
+                                       XLogRecData *rdata,
+                                       XLogRecPtr StartPos, XLogRecPtr EndPos);
 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
                                                  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                                  XLogRecPtr *PrevPtr);
 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
-static void WakeupWaiters(XLogRecPtr EndPos);
 static char *GetXLogBuffer(XLogRecPtr ptr);
 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
 
-static void WALInsertSlotAcquire(bool exclusive);
-static void WALInsertSlotAcquireOne(int slotno);
-static void WALInsertSlotRelease(void);
-static void WALInsertSlotReleaseOne(int slotno);
+static void WALInsertLockAcquire(void);
+static void WALInsertLockAcquireExclusive(void);
+static void WALInsertLockRelease(void);
+static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
 /*
  * Insert an XLOG record having the specified RMID and info bytes,
@@ -855,9 +861,9 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 
        if (rechdr == NULL)
        {
-               rechdr = malloc(SizeOfXLogRecord);
-               if (rechdr == NULL)
-                       elog(ERROR, "out of memory");
+               static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF];
+
+               rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf);
                MemSet(rechdr, 0, SizeOfXLogRecord);
        }
 
@@ -887,7 +893,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
         *
         * We may have to loop back to here if a race condition is detected below.
         * We could prevent the race by doing all this work while holding an
-        * insertion slot, but it seems better to avoid doing CRC calculations
+        * insertion lock, but it seems better to avoid doing CRC calculations
         * while holding one.
         *
         * We add entries for backup blocks to the chain, so that they don't need
@@ -905,8 +911,8 @@ begin:;
        /*
         * Decide if we need to do full-page writes in this XLOG record: true if
         * full_page_writes is on or we have a PITR request for it.  Since we
-        * don't yet have an insertion slot, fullPageWrites and forcePageWrites
-        * could change under us, but we'll recheck them once we have a slot.
+        * don't yet have an insertion lock, fullPageWrites and forcePageWrites
+        * could change under us, but we'll recheck them once we have a lock.
         */
        doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
 
@@ -1072,24 +1078,23 @@ begin:;
         * record to the shared WAL buffer cache is a two-step process:
         *
         * 1. Reserve the right amount of space from the WAL. The current head of
-        *    reserved space is kept in Insert->CurrBytePos, and is protected by
-        *    insertpos_lck.
+        *        reserved space is kept in Insert->CurrBytePos, and is protected by
+        *        insertpos_lck.
         *
         * 2. Copy the record to the reserved WAL space. This involves finding the
-        *    correct WAL buffer containing the reserved space, and copying the
-        *    record in place. This can be done concurrently in multiple processes.
+        *        correct WAL buffer containing the reserved space, and copying the
+        *        record in place. This can be done concurrently in multiple processes.
         *
         * To keep track of which insertions are still in-progress, each concurrent
-        * inserter allocates an "insertion slot", which tells others how far the
-        * inserter has progressed. There is a small fixed number of insertion
-        * slots, determined by the num_xloginsert_slots GUC. When an inserter
-        * finishes, it updates the xlogInsertingAt of its slot to the end of the
-        * record it inserted, to let others know that it's done. xlogInsertingAt
-        * is also updated when crossing over to a new WAL buffer, to allow the
-        * the previous buffer to be flushed.
+        * inserter acquires an insertion lock. In addition to just indicating that
+        * an insertion is in progress, the lock tells others how far the inserter
+        * has progressed. There is a small fixed number of insertion locks,
+        * determined by the num_xloginsert_locks GUC. When an inserter crosses a
+        * page boundary, it updates the value stored in the lock to how far it
+        * has inserted, to allow the previous buffer to be flushed.
         *
-        * Holding onto a slot also protects RedoRecPtr and fullPageWrites from
-        * changing until the insertion is finished.
+        * Holding onto an insertion lock also protects RedoRecPtr and
+        * fullPageWrites from changing until the insertion is finished.
         *
         * Step 2 can usually be done completely in parallel. If the required WAL
         * page is not initialized yet, you have to grab WALBufMappingLock to
@@ -1099,7 +1104,10 @@ begin:;
         *----------
         */
        START_CRIT_SECTION();
-       WALInsertSlotAcquire(isLogSwitch);
+       if (isLogSwitch)
+               WALInsertLockAcquireExclusive();
+       else
+               WALInsertLockAcquire();
 
        /*
         * Check to see if my RedoRecPtr is out of date.  If so, may have to go
@@ -1128,7 +1136,7 @@ begin:;
                                         * Oops, this buffer now needs to be backed up, but we
                                         * didn't think so above.  Start over.
                                         */
-                                       WALInsertSlotRelease();
+                                       WALInsertLockRelease();
                                        END_CRIT_SECTION();
                                        rdt_lastnormal->next = NULL;
                                        info = info_orig;
@@ -1147,7 +1155,7 @@ begin:;
        if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
        {
                /* Oops, must redo it with full-page data. */
-               WALInsertSlotRelease();
+               WALInsertLockRelease();
                END_CRIT_SECTION();
                rdt_lastnormal->next = NULL;
                info = info_orig;
@@ -1195,7 +1203,9 @@ begin:;
        /*
         * Done! Let others know that we're finished.
         */
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
+
+       MarkCurrentTransactionIdLoggedIfAny();
 
        END_CRIT_SECTION();
 
@@ -1225,6 +1235,7 @@ begin:;
        {
                TRACE_POSTGRESQL_XLOG_SWITCH();
                XLogFlush(EndPos);
+
                /*
                 * Even though we reserved the rest of the segment for us, which is
                 * reflected in EndPos, we return a pointer to just the end of the
@@ -1254,8 +1265,24 @@ begin:;
                xlog_outrec(&buf, rechdr);
                if (rdata->data != NULL)
                {
-                       appendStringInfo(&buf, " - ");
-                       RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
+                       StringInfoData recordbuf;
+
+                       /*
+                        * We have to piece together the WAL record data from the
+                        * XLogRecData entries, so that we can pass it to the rm_desc
+                        * function as one contiguous chunk. (but we can leave out any
+                        * extra entries we created for backup blocks)
+                        */
+                       rdt_lastnormal->next = NULL;
+
+                       initStringInfo(&recordbuf);
+                       appendBinaryStringInfo(&recordbuf, (char *) rechdr, sizeof(XLogRecord));
+                       for (; rdata != NULL; rdata = rdata->next)
+                               appendBinaryStringInfo(&recordbuf, rdata->data, rdata->len);
+
+                       appendStringInfoString(&buf, " - ");
+                       RmgrTable[rechdr->xl_rmid].rm_desc(&buf, (XLogRecord *) recordbuf.data);
+                       pfree(recordbuf.data);
                }
                elog(LOG, "%s", buf.data);
                pfree(buf.data);
@@ -1337,8 +1364,8 @@ ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
  *
  * A log-switch record is handled slightly differently. The rest of the
  * segment will be reserved for this insertion, as indicated by the returned
- * *EndPos_p value. However, if we are already at the beginning of the current
- * segment, *StartPos_p and *EndPos_p are set to the current location without
+ * *EndPos value. However, if we are already at the beginning of the current
+ * segment, *StartPos and *EndPos are set to the current location without
  * reserving any space, and the function returns false.
 */
 static bool
@@ -1354,7 +1381,7 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
 
        /*
         * These calculations are a bit heavy-weight to be done while holding a
-        * spinlock, but since we're holding all the WAL insertion slots, there
+        * spinlock, but since we're holding all the WAL insertion locks, there
         * are no other inserters competing for it. GetXLogInsertRecPtr() does
         * compete for it, but that's not called very frequently.
         */
@@ -1488,12 +1515,12 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
        Assert(written == write_len);
 
        /* Align the end position, so that the next record starts aligned */
-       CurrPos = MAXALIGN(CurrPos);
+       CurrPos = MAXALIGN64(CurrPos);
 
        /*
         * If this was an xlog-switch, it's not enough to write the switch record,
-        * we also have to consume all the remaining space in the WAL segment.
-        * We have already reserved it for us, but we still need to make sure it's
+        * we also have to consume all the remaining space in the WAL segment. We
+        * have already reserved it for us, but we still need to make sure it's
         * allocated and zeroed in the WAL buffers so that when the caller (or
         * someone else) does XLogWrite(), it can really write out all the zeros.
         */
@@ -1514,7 +1541,7 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
                while (CurrPos < EndPos)
                {
                        /* initialize the next page (if not initialized already) */
-                       WakeupWaiters(CurrPos);
+                       WALInsertLockUpdateInsertingAt(CurrPos);
                        AdvanceXLInsertBuffer(CurrPos, false);
                        CurrPos += XLOG_BLCKSZ;
                }
@@ -1525,449 +1552,123 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
 }
 
 /*
- * Allocate a slot for insertion.
- *
- * In exclusive mode, all slots are reserved for the current process. That
- * blocks all concurrent insertions.
- */
-static void
-WALInsertSlotAcquire(bool exclusive)
-{
-       int                     i;
-
-       if (exclusive)
-       {
-               for (i = 0; i < num_xloginsert_slots; i++)
-                       WALInsertSlotAcquireOne(i);
-               holdingAllSlots = true;
-       }
-       else
-               WALInsertSlotAcquireOne(-1);
-}
-
-/*
- * Workhorse of WALInsertSlotAcquire. Acquires the given slot, or an arbitrary
- * one if slotno == -1. The index of the slot that was acquired is stored in
- * MySlotNo.
- *
- * This is more or less equivalent to LWLockAcquire().
+ * Acquire a WAL insertion lock, for inserting to WAL.
  */
 static void
-WALInsertSlotAcquireOne(int slotno)
+WALInsertLockAcquire(void)
 {
-       volatile XLogInsertSlot *slot;
-       PGPROC     *proc = MyProc;
-       bool            retry = false;
-       int                     extraWaits = 0;
-       static int      slotToTry = -1;
+       bool            immed;
 
        /*
-        * Try to use the slot we used last time. If the system isn't particularly
-        * busy, it's a good bet that it's available, and it's good to have some
-        * affinity to a particular slot so that you don't unnecessarily bounce
-        * cache lines between processes when there is no contention.
+        * It doesn't matter which of the WAL insertion locks we acquire, so try
+        * the one we used last time.  If the system isn't particularly busy, it's
+        * a good bet that it's still available, and it's good to have some
+        * affinity to a particular lock so that you don't unnecessarily bounce
+        * cache lines between processes when there's no contention.
         *
-        * If this is the first time through in this backend, pick a slot
-        * (semi-)randomly. This allows the slots to be used evenly if you have a
+        * If this is the first time through in this backend, pick a lock
+        * (semi-)randomly.  This allows the locks to be used evenly if you have a
         * lot of very short connections.
         */
-       if (slotno != -1)
-               MySlotNo = slotno;
-       else
-       {
-               if (slotToTry == -1)
-                       slotToTry = MyProc->pgprocno % num_xloginsert_slots;
-               MySlotNo = slotToTry;
-       }
-
-       /*
-        * We can't wait if we haven't got a PGPROC.  This should only occur
-        * during bootstrap or shared memory initialization.  Put an Assert here
-        * to catch unsafe coding practices.
-        */
-       Assert(MyProc != NULL);
+       static int      lockToTry = -1;
 
-       /*
-        * Lock out cancel/die interrupts until we exit the code section protected
-        * by the slot.  This ensures that interrupts will not interfere with
-        * manipulations of data structures in shared memory.
-        */
-       START_CRIT_SECTION();
+       if (lockToTry == -1)
+               lockToTry = MyProc->pgprocno % num_xloginsert_locks;
+       MyLockNo = lockToTry;
 
        /*
-        * Loop here to try to acquire slot after each time we are signaled by
-        * WALInsertSlotRelease.
+        * The insertingAt value is initially set to 0, as we don't know our
+        * insert location yet.
         */
-       for (;;)
+       immed = LWLockAcquireWithVar(&WALInsertLocks[MyLockNo].l.lock,
+                                                                &WALInsertLocks[MyLockNo].l.insertingAt,
+                                                                0);
+       if (!immed)
        {
-               bool            mustwait;
-
-               slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
-
-               /* Acquire mutex.  Time spent holding mutex should be short! */
-               SpinLockAcquire(&slot->mutex);
-
-               /* If retrying, allow WALInsertSlotRelease to release waiters again */
-               if (retry)
-                       slot->releaseOK = true;
-
-               /* If I can get the slot, do so quickly. */
-               if (slot->exclusive == 0)
-               {
-                       slot->exclusive++;
-                       mustwait = false;
-               }
-               else
-                       mustwait = true;
-
-               if (!mustwait)
-                       break;                          /* got the lock */
-
-               Assert(slot->owner != MyProc);
-
                /*
-                * Add myself to wait queue.
+                * If we couldn't get the lock immediately, try another lock next
+                * time.  On a system with more insertion locks than concurrent
+                * inserters, this causes all the inserters to eventually migrate to a
+                * lock that no-one else is using.  On a system with more inserters
+                * than locks, it still helps to distribute the inserters evenly
+                * across the locks.
                 */
-               proc->lwWaiting = true;
-               proc->lwWaitMode = LW_EXCLUSIVE;
-               proc->lwWaitLink = NULL;
-               if (slot->head == NULL)
-                       slot->head = proc;
-               else
-                       slot->tail->lwWaitLink = proc;
-               slot->tail = proc;
-
-               /* Can release the mutex now */
-               SpinLockRelease(&slot->mutex);
-
-               /*
-                * Wait until awakened.
-                *
-                * Since we share the process wait semaphore with the regular lock
-                * manager and ProcWaitForSignal, and we may need to acquire a slot
-                * while one of those is pending, it is possible that we get awakened
-                * for a reason other than being signaled by WALInsertSlotRelease. If
-                * so, loop back and wait again.  Once we've gotten the slot,
-                * re-increment the sema by the number of additional signals received,
-                * so that the lock manager or signal manager will see the received
-                * signal when it next waits.
-                */
-               for (;;)
-               {
-                       /* "false" means cannot accept cancel/die interrupt here. */
-                       PGSemaphoreLock(&proc->sem, false);
-                       if (!proc->lwWaiting)
-                               break;
-                       extraWaits++;
-               }
-
-               /* Now loop back and try to acquire lock again. */
-               retry = true;
+               lockToTry = (lockToTry + 1) % num_xloginsert_locks;
        }
-
-       slot->owner = proc;
-
-       /*
-        * Normally, we initialize the xlogInsertingAt value of the slot to 1,
-        * because we don't yet know where in the WAL we're going to insert. It's
-        * not critical what it points to right now - leaving it to a too small
-        * value just means that WaitXlogInsertionsToFinish() might wait on us
-        * unnecessarily, until we update the value (when we finish the insert or
-        * move to next page).
-        *
-        * If we're grabbing all the slots, however, stamp all but the last one
-        * with InvalidXLogRecPtr, meaning there is no insert in progress. The last
-        * slot is the one that we will update as we proceed with the insert, the
-        * rest are held just to keep off other inserters.
-        */
-       if (slotno != -1 && slotno != num_xloginsert_slots - 1)
-               slot->xlogInsertingAt = InvalidXLogRecPtr;
-       else
-               slot->xlogInsertingAt = 1;
-
-       /* We are done updating shared state of the slot itself. */
-       SpinLockRelease(&slot->mutex);
-
-       /*
-        * Fix the process wait semaphore's count for any absorbed wakeups.
-        */
-       while (extraWaits-- > 0)
-               PGSemaphoreUnlock(&proc->sem);
-
-       /*
-        * If we couldn't get the slot immediately, try another slot next time.
-        * On a system with more insertion slots than concurrent inserters, this
-        * causes all the inserters to eventually migrate to a slot that no-one
-        * else is using. On a system with more inserters than slots, it still
-        * causes the inserters to be distributed quite evenly across the slots.
-        */
-       if (slotno != -1 && retry)
-               slotToTry = (slotToTry + 1) % num_xloginsert_slots;
 }
 
 /*
- * Wait for the given slot to become free, or for its xlogInsertingAt location
- * to change to something else than 'waitptr'. In other words, wait for the
- * inserter using the given slot to finish its insertion, or to at least make
- * some progress.
+ * Acquire all WAL insertion locks, to prevent other backends from inserting
+ * to WAL.
  */
 static void
-WaitOnSlot(volatile XLogInsertSlot *slot, XLogRecPtr waitptr)
+WALInsertLockAcquireExclusive(void)
 {
-       PGPROC     *proc = MyProc;
-       int                     extraWaits = 0;
-
-       /*
-        * Lock out cancel/die interrupts while we sleep on the slot. There is
-        * no cleanup mechanism to remove us from the wait queue if we got
-        * interrupted.
-        */
-       HOLD_INTERRUPTS();
+       int                     i;
 
        /*
-        * Loop here to try to acquire lock after each time we are signaled.
+        * When holding all the locks, we only update the last lock's insertingAt
+        * indicator.  The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
+        * than any real XLogRecPtr value, to make sure that no-one blocks waiting
+        * on those.
         */
-       for (;;)
+       for (i = 0; i < num_xloginsert_locks - 1; i++)
        {
-               bool            mustwait;
-
-               /* Acquire mutex.  Time spent holding mutex should be short! */
-               SpinLockAcquire(&slot->mutex);
-
-               /* If I can get the lock, do so quickly. */
-               if (slot->exclusive == 0 || slot->xlogInsertingAt != waitptr)
-                       mustwait = false;
-               else
-                       mustwait = true;
-
-               if (!mustwait)
-                       break;                          /* the lock was free */
-
-               Assert(slot->owner != MyProc);
-
-               /*
-                * Add myself to wait queue.
-                */
-               proc->lwWaiting = true;
-               proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
-               proc->lwWaitLink = NULL;
-
-               /* waiters are added to the front of the queue */
-               proc->lwWaitLink = slot->head;
-               if (slot->head == NULL)
-                       slot->tail = proc;
-               slot->head = proc;
-
-               /* Can release the mutex now */
-               SpinLockRelease(&slot->mutex);
-
-               /*
-                * Wait until awakened.
-                *
-                * Since we share the process wait semaphore with other things, like
-                * the regular lock manager and ProcWaitForSignal, and we may need to
-                * acquire an LWLock while one of those is pending, it is possible that
-                * we get awakened for a reason other than being signaled by
-                * LWLockRelease. If so, loop back and wait again.  Once we've gotten
-                * the LWLock, re-increment the sema by the number of additional
-                * signals received, so that the lock manager or signal manager will
-                * see the received signal when it next waits.
-                */
-               for (;;)
-               {
-                       /* "false" means cannot accept cancel/die interrupt here. */
-                       PGSemaphoreLock(&proc->sem, false);
-                       if (!proc->lwWaiting)
-                               break;
-                       extraWaits++;
-               }
-
-               /* Now loop back and try to acquire lock again. */
+               LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
+                                                        &WALInsertLocks[i].l.insertingAt,
+                                                        UINT64CONST(0xFFFFFFFFFFFFFFFF));
        }
+       LWLockAcquireWithVar(&WALInsertLocks[i].l.lock,
+                                                &WALInsertLocks[i].l.insertingAt,
+                                                0);
 
-       /* We are done updating shared state of the lock itself. */
-       SpinLockRelease(&slot->mutex);
-
-       /*
-        * Fix the process wait semaphore's count for any absorbed wakeups.
-        */
-       while (extraWaits-- > 0)
-               PGSemaphoreUnlock(&proc->sem);
-
-       /*
-        * Now okay to allow cancel/die interrupts.
-        */
-       RESUME_INTERRUPTS();
+       holdingAllLocks = true;
 }
 
 /*
- * Wake up all processes waiting for us with WaitOnSlot(). Sets our
- * xlogInsertingAt value to EndPos, without releasing the slot.
+ * Release our insertion lock (or locks, if we're holding them all).
  */
 static void
-WakeupWaiters(XLogRecPtr EndPos)
+WALInsertLockRelease(void)
 {
-       volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[MySlotNo].slot;
-       PGPROC     *head;
-       PGPROC     *proc;
-       PGPROC     *next;
-
-       /*
-        * If we have already reported progress up to the same point, do nothing.
-        * No other process can modify xlogInsertingAt, so we can check this before
-        * grabbing the spinlock.
-        */
-       if (slot->xlogInsertingAt == EndPos)
-               return;
-       /* xlogInsertingAt should not go backwards */
-       Assert(slot->xlogInsertingAt < EndPos);
-
-       /* Acquire mutex.  Time spent holding mutex should be short! */
-       SpinLockAcquire(&slot->mutex);
-
-       /* we should own the slot */
-       Assert(slot->exclusive == 1 && slot->owner == MyProc);
-
-       slot->xlogInsertingAt = EndPos;
-
-       /*
-        * See if there are any waiters that need to be woken up.
-        */
-       head = slot->head;
-
-       if (head != NULL)
+       if (holdingAllLocks)
        {
-               proc = head;
+               int                     i;
 
-               /* LW_WAIT_UNTIL_FREE waiters are always in the front of the queue */
-               next = proc->lwWaitLink;
-               while (next && next->lwWaitMode == LW_WAIT_UNTIL_FREE)
-               {
-                       proc = next;
-                       next = next->lwWaitLink;
-               }
+               for (i = 0; i < num_xloginsert_locks; i++)
+                       LWLockRelease(&WALInsertLocks[i].l.lock);
 
-               /* proc is now the last PGPROC to be released */
-               slot->head = next;
-               proc->lwWaitLink = NULL;
+               holdingAllLocks = false;
        }
-
-       /* We are done updating shared state of the lock itself. */
-       SpinLockRelease(&slot->mutex);
-
-       /*
-        * Awaken any waiters I removed from the queue.
-        */
-       while (head != NULL)
+       else
        {
-               proc = head;
-               head = proc->lwWaitLink;
-               proc->lwWaitLink = NULL;
-               proc->lwWaiting = false;
-               PGSemaphoreUnlock(&proc->sem);
+               LWLockRelease(&WALInsertLocks[MyLockNo].l.lock);
        }
 }
 
 /*
- * Release our insertion slot (or slots, if we're holding them all).
+ * Update our insertingAt value, to let others know that we've finished
+ * inserting up to that point.
  */
 static void
-WALInsertSlotRelease(void)
+WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
 {
-       int                     i;
-
-       if (holdingAllSlots)
+       if (holdingAllLocks)
        {
-               for (i = 0; i < num_xloginsert_slots; i++)
-                       WALInsertSlotReleaseOne(i);
-               holdingAllSlots = false;
+               /*
+                * We use the last lock to mark our actual position, see comments in
+                * WALInsertLockAcquireExclusive.
+                */
+               LWLockUpdateVar(&WALInsertLocks[num_xloginsert_locks - 1].l.lock,
+                                        &WALInsertLocks[num_xloginsert_locks - 1].l.insertingAt,
+                                               insertingAt);
        }
        else
-               WALInsertSlotReleaseOne(MySlotNo);
-}
-
-static void
-WALInsertSlotReleaseOne(int slotno)
-{
-       volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[slotno].slot;
-       PGPROC     *head;
-       PGPROC     *proc;
-
-       /* Acquire mutex.  Time spent holding mutex should be short! */
-       SpinLockAcquire(&slot->mutex);
-
-       /* we must be holding it */
-       Assert(slot->exclusive == 1 && slot->owner == MyProc);
-
-       slot->xlogInsertingAt = InvalidXLogRecPtr;
-
-       /* Release my hold on the slot */
-       slot->exclusive = 0;
-       slot->owner = NULL;
-
-       /*
-        * See if I need to awaken any waiters..
-        */
-       head = slot->head;
-       if (head != NULL)
-       {
-               if (slot->releaseOK)
-               {
-                       /*
-                        * Remove the to-be-awakened PGPROCs from the queue.
-                        */
-                       bool            releaseOK = true;
-
-                       proc = head;
-
-                       /*
-                        * First wake up any backends that want to be woken up without
-                        * acquiring the lock. These are always in the front of the queue.
-                        */
-                       while (proc->lwWaitMode == LW_WAIT_UNTIL_FREE && proc->lwWaitLink)
-                               proc = proc->lwWaitLink;
-
-                       /*
-                        * Awaken the first exclusive-waiter, if any.
-                        */
-                       if (proc->lwWaitLink)
-                       {
-                               Assert(proc->lwWaitLink->lwWaitMode == LW_EXCLUSIVE);
-                               proc = proc->lwWaitLink;
-                               releaseOK = false;
-                       }
-                       /* proc is now the last PGPROC to be released */
-                       slot->head = proc->lwWaitLink;
-                       proc->lwWaitLink = NULL;
-
-                       slot->releaseOK = releaseOK;
-               }
-               else
-                       head = NULL;
-       }
-
-       /* We are done updating shared state of the slot itself. */
-       SpinLockRelease(&slot->mutex);
-
-       /*
-        * Awaken any waiters I removed from the queue.
-        */
-       while (head != NULL)
-       {
-               proc = head;
-               head = proc->lwWaitLink;
-               proc->lwWaitLink = NULL;
-               proc->lwWaiting = false;
-               PGSemaphoreUnlock(&proc->sem);
-       }
-
-       /*
-        * Now okay to allow cancel/die interrupts.
-        */
-       END_CRIT_SECTION();
+               LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
+                                               &WALInsertLocks[MyLockNo].l.insertingAt,
+                                               insertingAt);
 }
 
-
 /*
  * Wait for any WAL insertions < upto to finish.
  *
@@ -2017,79 +1718,50 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
        }
 
        /*
-        * finishedUpto is our return value, indicating the point upto which
-        * all the WAL insertions have been finished. Initialize it to the head
-        * of reserved WAL, and as we iterate through the insertion slots, back it
+        * Loop through all the locks, sleeping on any in-progress insert older
+        * than 'upto'.
+        *
+        * finishedUpto is our return value, indicating the point up to which
+        * all the WAL insertions have been finished. Initialize it to the head
+        * of reserved WAL, and as we iterate through the insertion locks, back it
         * out for any insertion that's still in progress.
         */
        finishedUpto = reservedUpto;
-
-       /*
-        * Loop through all the slots, sleeping on any in-progress insert older
-        * than 'upto'.
-        */
-       for (i = 0; i < num_xloginsert_slots; i++)
+       for (i = 0; i < num_xloginsert_locks; i++)
        {
-               volatile XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
-               XLogRecPtr insertingat;
+               XLogRecPtr      insertingat = InvalidXLogRecPtr;
 
-       retry:
-               /*
-                * We can check if the slot is in use without grabbing the spinlock.
-                * The spinlock acquisition of insertpos_lck before this loop acts
-                * as a memory barrier. If someone acquires the slot after that, it
-                * can't possibly be inserting to anything < reservedUpto. If it was
-                * acquired before that, an unlocked test will return true.
-                */
-               if (!slot->exclusive)
-                       continue;
-
-               SpinLockAcquire(&slot->mutex);
-               /* re-check now that we have the lock */
-               if (!slot->exclusive)
-               {
-                       SpinLockRelease(&slot->mutex);
-                       continue;
-               }
-               insertingat = slot->xlogInsertingAt;
-               SpinLockRelease(&slot->mutex);
-
-               if (insertingat == InvalidXLogRecPtr)
+               do
                {
                        /*
-                        * slot is reserved just to hold off other inserters, there is no
-                        * actual insert in progress.
+                        * See if this insertion is in progress. LWLockWaitForVar will
+                        * wait for the lock to be released, or for the 'value' to be set
+                        * by a LWLockUpdateVar call.  When a lock is initially acquired,
+                        * its value is 0 (InvalidXLogRecPtr), which means that we don't
+                        * know where it's inserting yet.  We will have to wait for it.
+                        * If it's a small insertion, the record will most likely fit on
+                        * the same page and the inserter will release the lock without
+                        * ever calling LWLockUpdateVar.  But if it has to sleep, it will
+                        * advertise the insertion point with LWLockUpdateVar before
+                        * sleeping.
                         */
-                       continue;
-               }
+                       if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
+                                                                &WALInsertLocks[i].l.insertingAt,
+                                                                insertingat, &insertingat))
+                       {
+                               /* the lock was free, so no insertion in progress */
+                               insertingat = InvalidXLogRecPtr;
+                               break;
+                       }
 
-               /*
-                * This insertion is still in progress. Do we need to wait for it?
-                *
-                * When an inserter acquires a slot, it doesn't reset 'insertingat', so
-                * it will initially point to the old value of some already-finished
-                * insertion. The inserter will update the value as soon as it finishes
-                * the insertion, moves to the next page, or has to do I/O to flush an
-                * old dirty buffer. That means that when we see a slot with
-                * insertingat value < upto, we don't know if that insertion is still
-                * truly in progress, or if the slot is reused by a new inserter that
-                * hasn't updated the insertingat value yet. We have to assume it's the
-                * latter, and wait.
-                */
-               if (insertingat < upto)
-               {
-                       WaitOnSlot(slot, insertingat);
-                       goto retry;
-               }
-               else
-               {
                        /*
-                        * We don't need to wait for this insertion, but update the
-                        * return value.
+                        * This insertion is still in progress. Have to wait, unless the
+                        * inserter has proceeded past 'upto'.
                         */
-                       if (insertingat < finishedUpto)
-                               finishedUpto = insertingat;
-               }
+               } while (insertingat < upto);
+
+               if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
+                       finishedUpto = insertingat;
        }
        return finishedUpto;
 }
@@ -2103,8 +1775,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
  *
  * The caller must ensure that the page containing the requested location
  * isn't evicted yet, and won't be evicted. The way to ensure that is to
- * hold onto an XLogInsertSlot with the xlogInsertingAt position set to
- * something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
+ * hold onto a WAL insertion lock with the insertingAt position set to
+ * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
  * to evict an old page from the buffer. (This means that once you call
  * GetXLogBuffer() with a given 'ptr', you must not access anything before
  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
@@ -2131,9 +1803,9 @@ GetXLogBuffer(XLogRecPtr ptr)
        }
 
        /*
-        * The XLog buffer cache is organized so that a page is always loaded
-        * to a particular buffer.  That way we can easily calculate the buffer
-        * a given page must be loaded into, from the XLogRecPtr alone.
+        * The XLog buffer cache is organized so that a page is always loaded to a
+        * particular buffer.  That way we can easily calculate the buffer a given
+        * page must be loaded into, from the XLogRecPtr alone.
         */
        idx = XLogRecPtrToBufIdx(ptr);
 
@@ -2161,17 +1833,17 @@ GetXLogBuffer(XLogRecPtr ptr)
        if (expectedEndPtr != endptr)
        {
                /*
-                * Let others know that we're finished inserting the record up
-                * to the page boundary.
+                * Let others know that we're finished inserting the record up to the
+                * page boundary.
                 */
-               WakeupWaiters(expectedEndPtr - XLOG_BLCKSZ);
+               WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
 
                AdvanceXLInsertBuffer(ptr, false);
                endptr = XLogCtl->xlblocks[idx];
 
                if (expectedEndPtr != endptr)
                        elog(PANIC, "could not find WAL buffer for %X/%X",
-                                (uint32) (ptr >> 32) , (uint32) ptr);
+                                (uint32) (ptr >> 32), (uint32) ptr);
        }
        else
        {
@@ -2308,8 +1980,8 @@ XLogRecPtrToBytePos(XLogRecPtr ptr)
        else
        {
                result = fullsegs * UsableBytesInSegment +
-                       (XLOG_BLCKSZ - SizeOfXLogLongPHD) +  /* account for first page */
-                       (fullpages - 1) * UsableBytesInPage; /* full pages */
+                       (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
+                       (fullpages - 1) * UsableBytesInPage;            /* full pages */
                if (offset > 0)
                {
                        Assert(offset >= SizeOfXLogShortPHD);
@@ -2320,6 +1992,29 @@ XLogRecPtrToBytePos(XLogRecPtr ptr)
        return result;
 }
 
+/*
+ * Determine whether the buffer referenced has to be backed up.
+ *
+ * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
+ * could change later, so the result should be used for optimization purposes
+ * only.
+ */
+bool
+XLogCheckBufferNeedsBackup(Buffer buffer)
+{
+       bool            doPageWrites;
+       Page            page;
+
+       page = BufferGetPage(buffer);
+
+       doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites;
+
+       if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
+               return true;                    /* buffer requires backup */
+
+       return false;                           /* buffer does not need to be backed up */
+}
+
 /*
  * Determine whether the buffer referenced by an XLogRecData item has to
  * be backed up, and if so fill a BkpBlock struct for it.  In any case
@@ -2409,9 +2104,9 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
         * Now that we have the lock, check if someone initialized the page
         * already.
         */
-       while (upto >= XLogCtl->xlblocks[XLogCtl->curridx] || opportunistic)
+       while (upto >= XLogCtl->InitializedUpTo || opportunistic)
        {
-               nextidx = NextBufIdx(XLogCtl->curridx);
+               nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
 
                /*
                 * Get ending-offset of the buffer page we need to replace (this may
@@ -2481,14 +2176,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
                }
 
                /*
-                * Now the next buffer slot is free and we can set it up to be the next
-                * output page.
+                * Now the next buffer slot is free and we can set it up to be the
+                * next output page.
                 */
-               NewPageBeginPtr = XLogCtl->xlblocks[XLogCtl->curridx];
+               NewPageBeginPtr = XLogCtl->InitializedUpTo;
                NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
 
-               Assert(NewPageEndPtr % XLOG_BLCKSZ == 0);
-               Assert(XLogRecEndPtrToBufIdx(NewPageEndPtr) == nextidx);
                Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
 
                NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
@@ -2507,7 +2200,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
                /* NewPage->xlp_info = 0; */    /* done by memset */
                NewPage   ->xlp_tli = ThisTimeLineID;
                NewPage   ->xlp_pageaddr = NewPageBeginPtr;
-               /* NewPage->xlp_rem_len = 0; */         /* done by memset */
+
+               /* NewPage->xlp_rem_len = 0; */ /* done by memset */
 
                /*
                 * If online backup is not in progress, mark the header to indicate
@@ -2515,12 +2209,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
                 * blocks.  This allows the WAL archiver to know whether it is safe to
                 * compress archived WAL data by transforming full-block records into
                 * the non-full-block format.  It is sufficient to record this at the
-                * page level because we force a page switch (in fact a segment switch)
-                * when starting a backup, so the flag will be off before any records
-                * can be written during the backup.  At the end of a backup, the last
-                * page will be marked as all unsafe when perhaps only part is unsafe,
-                * but at worst the archiver would miss the opportunity to compress a
-                * few records.
+                * page level because we force a page switch (in fact a segment
+                * switch) when starting a backup, so the flag will be off before any
+                * records can be written during the backup.  At the end of a backup,
+                * the last page will be marked as all unsafe when perhaps only part
+                * is unsafe, but at worst the archiver would miss the opportunity to
+                * compress a few records.
                 */
                if (!Insert->forcePageWrites)
                        NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
@@ -2547,7 +2241,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
 
                *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
 
-               XLogCtl->curridx = nextidx;
+               XLogCtl->InitializedUpTo = NewPageEndPtr;
 
                npages++;
        }
@@ -2598,7 +2292,6 @@ XLogCheckpointNeeded(XLogSegNo new_segno)
 static void
 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 {
-       XLogCtlWrite *Write = &XLogCtl->Write;
        bool            ispartialpage;
        bool            last_iteration;
        bool            finishing_seg;
@@ -2631,12 +2324,10 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 
        /*
         * Within the loop, curridx is the cache block index of the page to
-        * consider writing.  We advance Write->curridx only after successfully
-        * writing pages.  (Right now, this refinement is useless since we are
-        * going to PANIC if any error occurs anyway; but someday it may come in
-        * useful.)
+        * consider writing.  Begin at the buffer containing the next unwritten
+        * page, or last partially written page.
         */
-       curridx = Write->curridx;
+       curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
 
        while (LogwrtResult.Write < WriteRqst.Write)
        {
@@ -2645,7 +2336,8 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
                 * if we're passed a bogus WriteRqst.Write that is past the end of the
                 * last page that's been initialized by AdvanceXLInsertBuffer.
                 */
-               XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
+               XLogRecPtr      EndPtr = XLogCtl->xlblocks[curridx];
+
                if (LogwrtResult.Write >= EndPtr)
                        elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
                                 (uint32) (LogwrtResult.Write >> 32),
@@ -2729,7 +2421,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
                        do
                        {
                                errno = 0;
-                               written  = write(openLogFile, from, nleft);
+                               written = write(openLogFile, from, nleft);
                                if (written <= 0)
                                {
                                        if (errno == EINTR)
@@ -2737,9 +2429,9 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
                                        ereport(PANIC,
                                                        (errcode_for_file_access(),
                                                         errmsg("could not write to log file %s "
-                                                                       "at offset %u, length %lu: %m",
-                                                                       XLogFileNameP(ThisTimeLineID, openLogSegNo),
-                                                                       openLogOff, (unsigned long) nbytes)));
+                                                                       "at offset %u, length %zu: %m",
+                                                                XLogFileNameP(ThisTimeLineID, openLogSegNo),
+                                                                       openLogOff, nbytes)));
                                }
                                nleft -= written;
                                from += written;
@@ -2747,7 +2439,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 
                        /* Update state for write */
                        openLogOff += nbytes;
-                       Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
                        npages = 0;
 
                        /*
@@ -2775,7 +2466,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
                                if (XLogArchivingActive())
                                        XLogArchiveNotifySeg(openLogSegNo);
 
-                               Write->lastSegSwitchTime = (pg_time_t) time(NULL);
+                               XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
 
                                /*
                                 * Request a checkpoint if we've consumed too much xlog since
@@ -2807,7 +2498,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
        }
 
        Assert(npages == 0);
-       Assert(curridx == Write->curridx);
 
        /*
         * If asked to flush, do so
@@ -2818,7 +2508,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
        {
                /*
                 * Could get here without iterating above loop, in which case we might
-                * have no open file or the wrong one.  However, we do not need to
+                * have no open file or the wrong one.  However, we do not need to
                 * fsync more than one file.
                 */
                if (sync_method != SYNC_METHOD_OPEN &&
@@ -2887,7 +2577,7 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
 
        /*
         * If the WALWriter is sleeping, we should kick it to make it come out of
-        * low-power mode.      Otherwise, determine whether there's a full page of
+        * low-power mode.  Otherwise, determine whether there's a full page of
         * WAL available to write.
         */
        if (!sleeping)
@@ -2909,6 +2599,40 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
                SetLatch(ProcGlobal->walwriterLatch);
 }
 
+/*
+ * Record 'lsn' in XLogCtl->replicationSlotMinLSN (under info_lck): the LSN up
+ * to which we can remove WAL because no replication slot requires it.
+ */
+void
+XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
+{
+       /* volatile pointer keeps the compiler from moving accesses past the lock */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       xlogctl->replicationSlotMinLSN = lsn;
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+
+/*
+ * Return the oldest LSN we must retain to satisfy the needs of some
+ * replication slot, read from XLogCtl->replicationSlotMinLSN under info_lck.
+ */
+static XLogRecPtr
+XLogGetReplicationSlotMinimumLSN(void)
+{
+       /* volatile pointer keeps the compiler from moving accesses past the lock */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+       XLogRecPtr      retval;
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       retval = xlogctl->replicationSlotMinLSN;
+       SpinLockRelease(&xlogctl->info_lck);
+
+       return retval;
+}
+
 /*
  * Advance minRecoveryPoint in control file.
  *
@@ -3168,9 +2892,9 @@ XLogFlush(XLogRecPtr record)
  * We normally flush only completed blocks; but if there is nothing to do on
  * that basis, we check for unflushed async commits in the current incomplete
  * block, and flush through the latest one of those.  Thus, if async commits
- * are not being used, we will flush complete blocks only.     We can guarantee
+ * are not being used, we will flush complete blocks only.  We can guarantee
  * that async commits reach disk after at most three cycles; normally only
- * one or two. (When flushing complete blocks, we allow XLogWrite to write
+ * one or two.  (When flushing complete blocks, we allow XLogWrite to write
  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
  * difference only with very high load or long wal_writer_delay, but imposes
  * one extra cycle for the worst case for async commits.)
@@ -3345,7 +3069,7 @@ XLogNeedsFlush(XLogRecPtr record)
  * log, seg: identify segment to be created/opened.
  *
  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
- * pre-existing file will be deleted). On return, TRUE if a pre-existing
+ * pre-existing file will be deleted).  On return, TRUE if a pre-existing
  * file was used.
  *
  * use_lock: if TRUE, acquire ControlFileLock while moving file into
@@ -3364,10 +3088,12 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 {
        char            path[MAXPGPATH];
        char            tmppath[MAXPGPATH];
+       char            zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
+       char       *zbuffer;
        XLogSegNo       installed_segno;
        int                     max_advance;
        int                     fd;
-       bool            zero_fill = true;
+       int                     nbytes;
 
        XLogFilePath(path, ThisTimeLineID, logsegno);
 
@@ -3409,65 +3135,41 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
                                (errcode_for_file_access(),
                                 errmsg("could not create file \"%s\": %m", tmppath)));
 
-#ifdef HAVE_POSIX_FALLOCATE
        /*
-        * If posix_fallocate() is available and succeeds, then the file is
-        * properly allocated and we don't need to zero-fill it (which is less
-        * efficient).  In case of an error, fall back to writing zeros, because on
-        * some platforms posix_fallocate() is available but will not always
-        * succeed in cases where zero-filling will.
+        * Zero-fill the file.  We have to do this the hard way to ensure that all
+        * the file space has really been allocated --- on platforms that allow
+        * "holes" in files, just seeking to the end doesn't allocate intermediate
+        * space.  This way, we know that we have all the space and (after the
+        * fsync below) that all the indirect blocks are down on disk.  Therefore,
+        * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
+        * log file.
+        *
+        * Note: ensure the buffer is reasonably well-aligned; this may save a few
+        * cycles transferring data to the kernel.
         */
-       if (posix_fallocate(fd, 0, XLogSegSize) == 0)
-               zero_fill = false;
-#endif /* HAVE_POSIX_FALLOCATE */
-
-       if (zero_fill)
+       zbuffer = (char *) MAXALIGN(zbuffer_raw);
+       memset(zbuffer, 0, XLOG_BLCKSZ);
+       for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
        {
-               /*
-                * Allocate a buffer full of zeros. This is done before opening the
-                * file so that we don't leak the file descriptor if palloc fails.
-                *
-                * Note: palloc zbuffer, instead of just using a local char array, to
-                * ensure it is reasonably well-aligned; this may save a few cycles
-                * transferring data to the kernel.
-                */
-
-               char    *zbuffer = (char *) palloc0(XLOG_BLCKSZ);
-               int              nbytes;
-
-               /*
-                * Zero-fill the file. We have to do this the hard way to ensure that
-                * all the file space has really been allocated --- on platforms that
-                * allow "holes" in files, just seeking to the end doesn't allocate
-                * intermediate space.  This way, we know that we have all the space
-                * and (after the fsync below) that all the indirect blocks are down on
-                * disk. Therefore, fdatasync(2) or O_DSYNC will be sufficient to sync
-                * future writes to the log file.
-                */
-               for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
+               errno = 0;
+               if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
                {
-                       errno = 0;
-                       if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
-                       {
-                               int                     save_errno = errno;
+                       int                     save_errno = errno;
 
-                               /*
-                                * If we fail to make the file, delete it to release disk space
-                                */
-                               unlink(tmppath);
+                       /*
+                        * If we fail to make the file, delete it to release disk space
+                        */
+                       unlink(tmppath);
 
-                               close(fd);
+                       close(fd);
 
-                               /* if write didn't set errno, assume no disk space */
-                               errno = save_errno ? save_errno : ENOSPC;
+                       /* if write didn't set errno, assume problem is no disk space */
+                       errno = save_errno ? save_errno : ENOSPC;
 
-                               ereport(ERROR,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not write to file \"%s\": %m",
-                                                               tmppath)));
-                       }
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not write to file \"%s\": %m", tmppath)));
                }
-               pfree(zbuffer);
        }
 
        if (pg_fsync(fd) != 0)
@@ -3530,7 +3232,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
  *             a different timeline)
  *
  * Currently this is only used during recovery, and so there are no locking
- * considerations.     But we should be just as tense as XLogFileInit to avoid
+ * considerations.  But we should be just as tense as XLogFileInit to avoid
  * emplacing a bogus file.
  */
 static void
@@ -3741,7 +3443,7 @@ XLogFileOpen(XLogSegNo segno)
        if (fd < 0)
                ereport(PANIC,
                                (errcode_for_file_access(),
-                                errmsg("could not open xlog file \"%s\": %m", path)));
+                       errmsg("could not open transaction log file \"%s\": %m", path)));
 
        return fd;
 }
@@ -3848,13 +3550,13 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
         * the timelines listed in expectedTLEs.
         *
         * We expect curFileTLI on entry to be the TLI of the preceding file in
-        * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
+        * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
         * to go backwards; this prevents us from picking up the wrong file when a
         * parent timeline extends to higher segment numbers than the child we
         * want to read.
         *
         * If we haven't read the timeline history file yet, read it now, so that
-        * we know which TLIs to scan.  We don't save the list in expectedTLEs,
+        * we know which TLIs to scan.  We don't save the list in expectedTLEs,
         * however, unless we actually find a valid segment.  That way if there is
         * neither a timeline history file nor a WAL segment in the archive, and
         * streaming replication is set up, we'll read the timeline history file
@@ -3918,7 +3620,7 @@ XLogFileClose(void)
 
        /*
         * WAL segment files will not be re-read in normal operation, so we advise
-        * the OS to release any cached pages.  But do not do so if WAL archiving
+        * the OS to release any cached pages.  But do not do so if WAL archiving
         * or streaming is active, because archiver and walsender process could
         * use the cache to read the WAL segment.
         */
@@ -3994,6 +3696,27 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
        }
 }
 
+/*
+ * Return the last WAL segment removed, or 0 if no segment has been removed
+ * since startup.  The value is copied from XLogCtl under info_lck.
+ *
+ * NB: the result can become out of date arbitrarily fast; the caller has to
+ * deal with that.
+ */
+XLogSegNo
+XLogGetLastRemovedSegno(void)
+{
+       /* volatile pointer keeps the compiler from moving accesses past the lock */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+       XLogSegNo       lastRemovedSegNo;
+
+       SpinLockAcquire(&xlogctl->info_lck);
+       lastRemovedSegNo = xlogctl->lastRemovedSegNo;
+       SpinLockRelease(&xlogctl->info_lck);
+
+       return lastRemovedSegNo;
+}
+
 /*
  * Update the last removed segno pointer in shared memory, to reflect
  * that the given XLOG file has been removed.
@@ -4063,7 +3786,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
        {
                /*
                 * We ignore the timeline part of the XLOG segment identifiers in
-                * deciding whether a segment is still needed.  This ensures that we
+                * deciding whether a segment is still needed.  This ensures that we
                 * won't prematurely remove a segment from a parent timeline. We could
                 * probably be a little more proactive about removing segments of
                 * non-parent timelines, but that would be a whole lot more
@@ -4597,7 +4320,7 @@ rescanLatestTimeLine(void)
  * I/O routines for pg_control
  *
  * *ControlFile is a buffer in shared memory that holds an image of the
- * contents of pg_control.     WriteControlFile() initializes pg_control
+ * contents of pg_control.  WriteControlFile() initializes pg_control
  * given a preloaded buffer, ReadControlFile() loads the buffer from
  * the pg_control file (during postmaster or standalone-backend startup),
  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
@@ -4631,6 +4354,7 @@ WriteControlFile(void)
        ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
 
        ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
+       ControlFile->loblksize = LOBLKSIZE;
 
 #ifdef HAVE_INT64_TIMESTAMP
        ControlFile->enableIntTimes = true;
@@ -4824,6 +4548,13 @@ ReadControlFile(void)
                                " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
                          ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
                                 errhint("It looks like you need to recompile or initdb.")));
+       if (ControlFile->loblksize != LOBLKSIZE)
+               ereport(FATAL,
+                               (errmsg("database files are incompatible with server"),
+                 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
+                                       " but the server was compiled with LOBLKSIZE %d.",
+                                       ControlFile->loblksize, (int) LOBLKSIZE),
+                                errhint("It looks like you need to recompile or initdb.")));
 
 #ifdef HAVE_INT64_TIMESTAMP
        if (ControlFile->enableIntTimes != true)
@@ -4872,6 +4603,10 @@ ReadControlFile(void)
                                  " but the server was compiled without USE_FLOAT8_BYVAL."),
                                 errhint("It looks like you need to recompile or initdb.")));
 #endif
+
+       /* Make the initdb settings visible as GUC variables, too */
+       SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
+                                       PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
 void
@@ -4998,7 +4733,7 @@ check_wal_buffers(int *newval, void **extra, GucSource source)
        {
                /*
                 * If we haven't yet changed the boot_val default of -1, just let it
-                * be.  We'll fix it when XLOGShmemSize is called.
+                * be.  We'll fix it when XLOGShmemSize is called.
                 */
                if (XLOGbuffers == -1)
                        return true;
@@ -5046,8 +4781,8 @@ XLOGShmemSize(void)
        /* XLogCtl */
        size = sizeof(XLogCtlData);
 
-       /* xlog insertion slots, plus alignment */
-       size = add_size(size, mul_size(sizeof(XLogInsertSlotPadded), num_xloginsert_slots + 1));
+       /* WAL insertion locks, plus alignment */
+       size = add_size(size, mul_size(sizeof(WALInsertLockPadded), num_xloginsert_locks + 1));
        /* xlblocks array */
        size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
        /* extra alignment padding for XLOG I/O buffers */
@@ -5074,7 +4809,8 @@ XLOGShmemInit(void)
 
        ControlFile = (ControlFileData *)
                ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
-       allocptr = ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
+       XLogCtl = (XLogCtlData *)
+               ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
 
        if (foundCFile || foundXLog)
        {
@@ -5082,7 +4818,6 @@ XLOGShmemInit(void)
                Assert(foundCFile && foundXLog);
                return;
        }
-       XLogCtl = (XLogCtlData *) allocptr;
        memset(XLogCtl, 0, sizeof(XLogCtlData));
 
        /*
@@ -5090,21 +4825,37 @@ XLOGShmemInit(void)
         * multiple of the alignment for same, so no extra alignment padding is
         * needed here.
         */
-       allocptr += sizeof(XLogCtlData);
+       allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
        XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
        memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
        allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
 
-       /* Xlog insertion slots. Ensure they're aligned to the full padded size */
-       allocptr += sizeof(XLogInsertSlotPadded) -
-               ((uintptr_t) allocptr) % sizeof(XLogInsertSlotPadded);
-       XLogCtl->Insert.insertSlots = (XLogInsertSlotPadded *) allocptr;
-       allocptr += sizeof(XLogInsertSlotPadded) * num_xloginsert_slots;
+
+       /* WAL insertion locks. Ensure they're aligned to the full padded size */
+       allocptr += sizeof(WALInsertLockPadded) -
+               ((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
+       WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
+               (WALInsertLockPadded *) allocptr;
+       allocptr += sizeof(WALInsertLockPadded) * num_xloginsert_locks;
+
+       XLogCtl->Insert.WALInsertLockTrancheId = LWLockNewTrancheId();
+
+       XLogCtl->Insert.WALInsertLockTranche.name = "WALInsertLocks";
+       XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
+       XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);
+
+       LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId, &XLogCtl->Insert.WALInsertLockTranche);
+       for (i = 0; i < num_xloginsert_locks; i++)
+       {
+               LWLockInitialize(&WALInsertLocks[i].l.lock,
+                                                XLogCtl->Insert.WALInsertLockTrancheId);
+               WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+       }
 
        /*
         * Align the start of the page buffers to a full xlog block size boundary.
-        * This simplifies some calculations in XLOG insertion. It is also required
-        * for O_DIRECT.
+        * This simplifies some calculations in XLOG insertion. It is also
+        * required for O_DIRECT.
         */
        allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
        XLogCtl->pages = allocptr;
@@ -5119,19 +4870,6 @@ XLOGShmemInit(void)
        XLogCtl->SharedHotStandbyActive = false;
        XLogCtl->WalWriterSleeping = false;
 
-       for (i = 0; i < num_xloginsert_slots; i++)
-       {
-               XLogInsertSlot *slot = &XLogCtl->Insert.insertSlots[i].slot;
-               SpinLockInit(&slot->mutex);
-               slot->xlogInsertingAt = InvalidXLogRecPtr;
-               slot->owner = NULL;
-
-               slot->releaseOK = true;
-               slot->exclusive = 0;
-               slot->head = NULL;
-               slot->tail = NULL;
-       }
-
        SpinLockInit(&XLogCtl->Insert.insertpos_lck);
        SpinLockInit(&XLogCtl->info_lck);
        SpinLockInit(&XLogCtl->ulsn_lck);
@@ -5169,15 +4907,16 @@ BootStrapXLOG(void)
         * field, as being about as unique as we can easily get.  (Think not to
         * use random(), since it hasn't been seeded and there's no portable way
         * to seed it other than the system clock value...)  The upper half of the
-        * uint64 value is just the tv_sec part, while the lower half is the XOR
-        * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
-        * unnecessarily if "uint64" is really only 32 bits wide.  A person
-        * knowing this encoding can determine the initialization time of the
-        * installation, which could perhaps be useful sometimes.
+        * uint64 value is just the tv_sec part, while the lower half contains the
+        * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
+        * PID for a little extra uniqueness.  A person knowing this encoding can
+        * determine the initialization time of the installation, which could
+        * perhaps be useful sometimes.
         */
        gettimeofday(&tv, NULL);
        sysidentifier = ((uint64) tv.tv_sec) << 32;
-       sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
+       sysidentifier |= ((uint64) tv.tv_usec) << 12;
+       sysidentifier |= getpid() & 0xFFF;
 
        /* First timeline ID is always 1 */
        ThisTimeLineID = 1;
@@ -5288,6 +5027,7 @@ BootStrapXLOG(void)
        ControlFile->max_prepared_xacts = max_prepared_xacts;
        ControlFile->max_locks_per_xact = max_locks_per_xact;
        ControlFile->wal_level = wal_level;
+       ControlFile->wal_log_hints = wal_log_hints;
        ControlFile->data_checksum_version = bootstrap_data_checksum_version;
 
        /* some additional ControlFile fields are set in WriteControlFile() */
@@ -5418,13 +5158,6 @@ readRecoveryCommandFile(void)
                }
                else if (strcmp(item->name, "recovery_target_time") == 0)
                {
-                       /*
-                        * if recovery_target_xid or recovery_target_name specified, then
-                        * this overrides recovery_target_time
-                        */
-                       if (recoveryTarget == RECOVERY_TARGET_XID ||
-                               recoveryTarget == RECOVERY_TARGET_NAME)
-                               continue;
                        recoveryTarget = RECOVERY_TARGET_TIME;
 
                        /*
@@ -5441,12 +5174,6 @@ readRecoveryCommandFile(void)
                }
                else if (strcmp(item->name, "recovery_target_name") == 0)
                {
-                       /*
-                        * if recovery_target_xid specified, then this overrides
-                        * recovery_target_name
-                        */
-                       if (recoveryTarget == RECOVERY_TARGET_XID)
-                               continue;
                        recoveryTarget = RECOVERY_TARGET_NAME;
 
                        recoveryTargetName = pstrdup(item->value);
@@ -5460,6 +5187,19 @@ readRecoveryCommandFile(void)
                                        (errmsg_internal("recovery_target_name = '%s'",
                                                                         recoveryTargetName)));
                }
+               else if (strcmp(item->name, "recovery_target") == 0)
+               {
+                       /* "immediate" is the only value accepted for this parameter */
+                       if (strcmp(item->value, "immediate") == 0)
+                               recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
+                       else
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("invalid recovery_target parameter"),
+                                                errhint("The only allowed value is \"immediate\".")));
+                       ereport(DEBUG2,
+                                       (errmsg_internal("recovery_target = '%s'", item->value)));
+               }
                else if (strcmp(item->name, "recovery_target_inclusive") == 0)
                {
                        /*
@@ -5491,6 +5231,14 @@ readRecoveryCommandFile(void)
                                        (errmsg_internal("primary_conninfo = '%s'",
                                                                         PrimaryConnInfo)));
                }
+               else if (strcmp(item->name, "primary_slot_name") == 0)
+               {
+                       ReplicationSlotValidateName(item->value, ERROR);
+                       PrimarySlotName = pstrdup(item->value);
+                       ereport(DEBUG2,
+                                       (errmsg_internal("primary_slot_name = '%s'",
+                                                                        PrimarySlotName)));
+               }
                else if (strcmp(item->name, "trigger_file") == 0)
                {
                        TriggerFile = pstrdup(item->value);
@@ -5498,6 +5246,20 @@ readRecoveryCommandFile(void)
                                        (errmsg_internal("trigger_file = '%s'",
                                                                         TriggerFile)));
                }
+               else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
+               {
+                       const char *hintmsg;    /* optional hint handed back by parse_int() */
+
+                       if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
+                                                  &hintmsg))
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("parameter \"%s\" requires a temporal value",
+                                                               "recovery_min_apply_delay"),
+                                                hintmsg ? errhint("%s", _(hintmsg)) : 0));
+                       ereport(DEBUG2,         /* debug echo: not translated, like its siblings */
+                                       (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
+               }
                else
                        ereport(FATAL,
                                        (errmsg("unrecognized recovery parameter \"%s\"",
@@ -5528,7 +5290,7 @@ readRecoveryCommandFile(void)
 
        /*
         * If user specified recovery_target_timeline, validate it or compute the
-        * "latest" value.      We can't do this until after we've gotten the restore
+        * "latest" value.  We can't do this until after we've gotten the restore
         * command and set InArchiveRecovery, because we need to fetch timeline
         * history files from the archive.
         */
@@ -5638,74 +5400,82 @@ exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
 }
 
 /*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG at or after the current record.
- *
- * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
- * *includeThis is set TRUE if we should apply this record before stopping.
+ * Extract timestamp from WAL record.
  *
- * We also track the timestamp of the latest applied COMMIT/ABORT
- * record in XLogCtl->recoveryLastXTime, for logging purposes.
- * Also, some information is saved in recoveryStopXid et al for use in
- * annotating the new timeline's history file.
+ * If the record contains a timestamp, returns true, and saves the timestamp
+ * in *recordXtime. If the record type has no timestamp, returns false.
+ * Currently, only transaction commit/abort records and restore points contain
+ * timestamps.
  */
 static bool
-recoveryStopsHere(XLogRecord *record, bool *includeThis)
+getRecordTimestamp(XLogRecord *record, TimestampTz *recordXtime)
 {
-       bool            stopsHere;
-       uint8           record_info;
-       TimestampTz recordXtime;
-       char            recordRPName[MAXFNAMELEN];
+       uint8           record_info = record->xl_info & ~XLR_INFO_MASK;
 
-       /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
-       if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
-               return false;
-       record_info = record->xl_info & ~XLR_INFO_MASK;
+       if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
+       {
+               *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
+               return true;
+       }
        if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
        {
-               xl_xact_commit_compact *recordXactCommitData;
-
-               recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
-               recordXtime = recordXactCommitData->xact_time;
+               *recordXtime = ((xl_xact_commit_compact *) XLogRecGetData(record))->xact_time;
+               return true;
        }
-       else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
+       if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
        {
-               xl_xact_commit *recordXactCommitData;
-
-               recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
-               recordXtime = recordXactCommitData->xact_time;
+               *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
+               return true;
        }
-       else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
+       if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
        {
-               xl_xact_abort *recordXactAbortData;
-
-               recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
-               recordXtime = recordXactAbortData->xact_time;
+               *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
+               return true;
        }
-       else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
+       return false;
+}
+
+/*
+ * For point-in-time recovery, this function decides whether we want to
+ * stop applying the XLOG before the current record.
+ *
+ * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
+ * information is saved in recoveryStopXid et al for use in annotating the
+ * new timeline's history file.
+ */
+static bool
+recoveryStopsBefore(XLogRecord *record)
+{
+       bool            stopsHere = false;
+       uint8           record_info;
+       bool            isCommit;
+       TimestampTz recordXtime = 0;
+
+       /* Check if we should stop as soon as reaching consistency */
+       if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
        {
-               xl_restore_point *recordRestorePointData;
+               ereport(LOG,
+                               (errmsg("recovery stopping after reaching consistency")));
 
-               recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
-               recordXtime = recordRestorePointData->rp_time;
-               strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
+               recoveryStopAfter = false;
+               recoveryStopXid = InvalidTransactionId;
+               recoveryStopTime = 0;
+               recoveryStopName[0] = '\0';
+               return true;
        }
-       else
-               return false;
 
-       /* Do we have a PITR target at all? */
-       if (recoveryTarget == RECOVERY_TARGET_UNSET)
-       {
-               /*
-                * Save timestamp of latest transaction commit/abort if this is a
-                * transaction record
-                */
-               if (record->xl_rmid == RM_XACT_ID)
-                       SetLatestXTime(recordXtime);
+       /* Otherwise we only consider stopping before COMMIT or ABORT records. */
+       if (record->xl_rmid != RM_XACT_ID)
+               return false;
+       record_info = record->xl_info & ~XLR_INFO_MASK;
+       if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+               isCommit = true;
+       else if (record_info == XLOG_XACT_ABORT)
+               isCommit = false;
+       else
                return false;
-       }
 
-       if (recoveryTarget == RECOVERY_TARGET_XID)
+       if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
        {
                /*
                 * There can be only one transaction end record with this exact
@@ -5717,24 +5487,10 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                 * 50% of the time...
                 */
                stopsHere = (record->xl_xid == recoveryTargetXid);
-               if (stopsHere)
-                       *includeThis = recoveryTargetInclusive;
        }
-       else if (recoveryTarget == RECOVERY_TARGET_NAME)
-       {
-               /*
-                * There can be many restore points that share the same name, so we
-                * stop at the first one
-                */
-               stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
 
-               /*
-                * Ignore recoveryTargetInclusive because this is not a transaction
-                * record
-                */
-               *includeThis = false;
-       }
-       else
+       if (recoveryTarget == RECOVERY_TARGET_TIME &&
+               getRecordTimestamp(record, &recordXtime))
        {
                /*
                 * There can be many transactions that share the same commit time, so
@@ -5745,64 +5501,132 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                        stopsHere = (recordXtime > recoveryTargetTime);
                else
                        stopsHere = (recordXtime >= recoveryTargetTime);
-               if (stopsHere)
-                       *includeThis = false;
        }
 
        if (stopsHere)
        {
+               recoveryStopAfter = false;
                recoveryStopXid = record->xl_xid;
                recoveryStopTime = recordXtime;
-               recoveryStopAfter = *includeThis;
+               recoveryStopName[0] = '\0';
 
-               if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+               if (isCommit)
                {
-                       if (recoveryStopAfter)
-                               ereport(LOG,
-                                               (errmsg("recovery stopping after commit of transaction %u, time %s",
-                                                               recoveryStopXid,
-                                                               timestamptz_to_str(recoveryStopTime))));
-                       else
-                               ereport(LOG,
-                                               (errmsg("recovery stopping before commit of transaction %u, time %s",
-                                                               recoveryStopXid,
-                                                               timestamptz_to_str(recoveryStopTime))));
+                       ereport(LOG,
+                                       (errmsg("recovery stopping before commit of transaction %u, time %s",
+                                                       recoveryStopXid,
+                                                       timestamptz_to_str(recoveryStopTime))));
                }
-               else if (record_info == XLOG_XACT_ABORT)
+               else
                {
-                       if (recoveryStopAfter)
-                               ereport(LOG,
-                                               (errmsg("recovery stopping after abort of transaction %u, time %s",
-                                                               recoveryStopXid,
-                                                               timestamptz_to_str(recoveryStopTime))));
-                       else
-                               ereport(LOG,
-                                               (errmsg("recovery stopping before abort of transaction %u, time %s",
-                                                               recoveryStopXid,
-                                                               timestamptz_to_str(recoveryStopTime))));
+                       ereport(LOG,
+                                       (errmsg("recovery stopping before abort of transaction %u, time %s",
+                                                       recoveryStopXid,
+                                                       timestamptz_to_str(recoveryStopTime))));
                }
-               else
+       }
+
+       return stopsHere;
+}
+
+/*
+ * Same as recoveryStopsBefore, but called after applying the record.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogCtl->recoveryLastXTime.
+ */
+static bool
+recoveryStopsAfter(XLogRecord *record)
+{
+       uint8           record_info;
+       TimestampTz recordXtime;
+
+       record_info = record->xl_info & ~XLR_INFO_MASK;
+
+       /*
+        * There can be many restore points that share the same name; we stop at
+        * the first one.
+        */
+       if (recoveryTarget == RECOVERY_TARGET_NAME &&
+               record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
+       {
+               xl_restore_point *recordRestorePointData;
+
+               recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+               if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
                {
-                       strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
+                       recoveryStopAfter = true;
+                       recoveryStopXid = InvalidTransactionId;
+                       (void) getRecordTimestamp(record, &recoveryStopTime);
+                       strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
 
                        ereport(LOG,
                                (errmsg("recovery stopping at restore point \"%s\", time %s",
                                                recoveryStopName,
                                                timestamptz_to_str(recoveryStopTime))));
+                       return true;
                }
+       }
+
+       if (record->xl_rmid == RM_XACT_ID &&
+               (record_info == XLOG_XACT_COMMIT_COMPACT ||
+                record_info == XLOG_XACT_COMMIT ||
+                record_info == XLOG_XACT_ABORT))
+       {
+               /* Update the last applied transaction timestamp */
+               if (getRecordTimestamp(record, &recordXtime))
+                       SetLatestXTime(recordXtime);
 
                /*
-                * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
-                * restore point since they are timestamped, though the latest
-                * transaction time is not updated.
+                * There can be only one transaction end record with this exact
+                * transactionid.
+                *
+                * When testing for an xid, we MUST test for equality only, since
+                * transactions are numbered in the order they start, not the order
+                * they complete. A higher numbered xid will complete before a lower
+                * numbered one about 50% of the time...
                 */
-               if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
-                       SetLatestXTime(recordXtime);
+               if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
+                       record->xl_xid == recoveryTargetXid)
+               {
+                       recoveryStopAfter = true;
+                       recoveryStopXid = record->xl_xid;
+                       recoveryStopTime = recordXtime;
+                       recoveryStopName[0] = '\0';
+
+                       if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
+                       {
+                               ereport(LOG,
+                                               (errmsg("recovery stopping after commit of transaction %u, time %s",
+                                                               recoveryStopXid,
+                                                               timestamptz_to_str(recoveryStopTime))));
+                       }
+                       else if (record_info == XLOG_XACT_ABORT)
+                       {
+                               ereport(LOG,
+                                               (errmsg("recovery stopping after abort of transaction %u, time %s",
+                                                               recoveryStopXid,
+                                                               timestamptz_to_str(recoveryStopTime))));
+                       }
+                       return true;
+               }
+       }
+
+       /* Check if we should stop as soon as reaching consistency */
+       if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+       {
+               ereport(LOG,
+                               (errmsg("recovery stopping after reaching consistency")));
+
+               recoveryStopAfter = true;
+               recoveryStopXid = InvalidTransactionId;
+               recoveryStopTime = 0;
+               recoveryStopName[0] = '\0';
+               return true;
        }
-       else if (record->xl_rmid == RM_XACT_ID)
-               SetLatestXTime(recordXtime);
 
-       return stopsHere;
+       return false;
 }
 
 /*
@@ -5855,6 +5679,90 @@ SetRecoveryPause(bool recoveryPause)
        SpinLockRelease(&xlogctl->info_lck);
 }
 
+/*
+ * When recovery_min_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the master.
+ *
+ * Returns true if we waited.
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static bool
+recoveryApplyDelay(XLogRecord *record)
+{
+       uint8           record_info;
+       TimestampTz xtime;
+       long            secs;
+       int                     microsecs;
+
+       /* nothing to do if no delay configured */
+       if (recovery_min_apply_delay == 0)
+               return false;
+
+       /*
+        * Is it a COMMIT record?
+        *
+        * We deliberately choose not to delay aborts since they have no effect on
+        * MVCC. We already allow replay of records that don't have a timestamp,
+        * so there is already opportunity for issues caused by early conflicts on
+        * standbys.
+        */
+       record_info = record->xl_info & ~XLR_INFO_MASK;
+       if (!(record->xl_rmid == RM_XACT_ID &&
+                 (record_info == XLOG_XACT_COMMIT_COMPACT ||
+                  record_info == XLOG_XACT_COMMIT)))
+               return false;
+
+       if (!getRecordTimestamp(record, &xtime))
+               return false;
+
+       recoveryDelayUntilTime =
+               TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+       /*
+        * Exit without arming the latch if it's already past time to apply this
+        * record
+        */
+       TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+                                               &secs, &microsecs);
+       if (secs <= 0 && microsecs <= 0)
+               return false;
+
+       while (true)
+       {
+               ResetLatch(&XLogCtl->recoveryWakeupLatch);
+
+               /* might change the trigger file's location */
+               HandleStartupProcInterrupts();
+
+               if (CheckForStandbyTrigger())
+                       break;
+
+               /*
+                * Compute the remaining delay.  The timeout below is rounded up to
+                * a whole millisecond so a sub-ms remainder cannot busy-loop.
+                */
+               TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
+                                                       &secs, &microsecs);
+
+               if (secs <= 0 && microsecs <= 0)
+                       break;
+
+               elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
+                        secs, microsecs / 1000);
+
+               WaitLatch(&XLogCtl->recoveryWakeupLatch,
+                                 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                                 secs * 1000L + (microsecs + 999) / 1000);
+       }
+       return true;
+}
+
 /*
  * Save timestamp of latest processed commit/abort record.
  *
@@ -5970,7 +5878,7 @@ CheckRequiredParameterValues(void)
         * For archive recovery, the WAL must be generated with at least 'archive'
         * wal_level.
         */
-       if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+       if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
        {
                ereport(WARNING,
                                (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
@@ -5981,11 +5889,11 @@ CheckRequiredParameterValues(void)
         * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
         * we must have at least as many backend slots as the primary.
         */
-       if (InArchiveRecovery && EnableHotStandby)
+       if (ArchiveRecoveryRequested && EnableHotStandby)
        {
                if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
                        ereport(ERROR,
-                                       (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
+                                       (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on the master server"),
                                         errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
 
                /* We ignore autovacuum_max_workers when we make this test. */
@@ -6021,12 +5929,10 @@ StartupXLOG(void)
        XLogSegNo       endLogSegNo;
        TimeLineID      PrevTimeLineID;
        XLogRecord *record;
-       uint32          freespace;
        TransactionId oldestActiveXID;
        bool            backupEndRequired = false;
        bool            backupFromStandby = false;
        DBState         dbstate_at_startup;
-       int                     firstIdx;
        XLogReaderState *xlogreader;
        XLogPageReadPrivate private;
        bool            fast_promoted = false;
@@ -6091,7 +5997,7 @@ StartupXLOG(void)
        ValidateXLOGDirectoryStructure();
 
        /*
-        * Clear out any old relcache cache files.      This is *necessary* if we do
+        * Clear out any old relcache cache files.  This is *necessary* if we do
         * any WAL replay, since that would probably result in the cache files
         * being out of sync with database reality.  In theory we could leave them
         * in place if the database had been cleanly shut down, but it seems
@@ -6120,7 +6026,7 @@ StartupXLOG(void)
         * Save archive_cleanup_command in shared memory so that other processes
         * can see it.
         */
-       strncpy(XLogCtl->archiveCleanupCommand,
+       strlcpy(XLogCtl->archiveCleanupCommand,
                        archiveCleanupCommand ? archiveCleanupCommand : "",
                        sizeof(XLogCtl->archiveCleanupCommand));
 
@@ -6141,6 +6047,9 @@ StartupXLOG(void)
                        ereport(LOG,
                                        (errmsg("starting point-in-time recovery to \"%s\"",
                                                        recoveryTargetName)));
+               else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+                       ereport(LOG,
+                                       (errmsg("starting point-in-time recovery to earliest consistent point")));
                else
                        ereport(LOG,
                                        (errmsg("starting archive recovery")));
@@ -6160,7 +6069,7 @@ StartupXLOG(void)
                ereport(ERROR,
                                (errcode(ERRCODE_OUT_OF_MEMORY),
                                 errmsg("out of memory"),
-                       errdetail("Failed while allocating an XLog reading processor")));
+                  errdetail("Failed while allocating an XLog reading processor.")));
        xlogreader->system_identifier = ControlFile->system_identifier;
 
        if (read_backup_label(&checkPointLoc, &backupEndRequired,
@@ -6303,7 +6212,7 @@ StartupXLOG(void)
                ereport(FATAL,
                                (errmsg("requested timeline %u is not a child of this server's history",
                                                recoveryTargetTLI),
-                                errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
+                                errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
                                                   (uint32) (ControlFile->checkPoint >> 32),
                                                   (uint32) ControlFile->checkPoint,
                                                   ControlFile->checkPointCopy.ThisTimeLineID,
@@ -6355,9 +6264,30 @@ StartupXLOG(void)
        MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
        SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
        SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+       MultiXactSetSafeTruncate(checkPoint.oldestMulti);
        XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
        XLogCtl->ckptXid = checkPoint.nextXid;
 
+       /*
+        * Initialize replication slots, before there's a chance to remove
+        * required resources.
+        */
+       StartupReplicationSlots();
+
+       /*
+        * Start up logical state; it needs to be set up now so that we have
+        * proper data during crash recovery.
+        */
+       StartupReorderBuffer();
+
+       /*
+        * Startup MultiXact.  We need to do this early for two reasons: one is
+        * that we might try to access multixacts when we do tuple freezing, and
+        * the other is we need its state initialized because we attempt
+        * truncation during restartpoints.
+        */
+       StartupMultiXact();
+
        /*
         * Initialize unlogged LSN. On a clean shutdown, it's restored from the
         * control file. On recovery, all unlogged relations are blown away, so
@@ -6558,8 +6488,9 @@ StartupXLOG(void)
                        ProcArrayInitRecovery(ShmemVariableCache->nextXid);
 
                        /*
-                        * Startup commit log and subtrans only. Other SLRUs are not
-                        * maintained during recovery and need not be started yet.
+                        * Startup commit log and subtrans only. MultiXact has already
+                        * been started up and other SLRUs are not maintained during
+                        * recovery and need not be started yet.
                         */
                        StartupCLOG();
                        StartupSUBTRANS(oldestActiveXID);
@@ -6606,22 +6537,18 @@ StartupXLOG(void)
                }
 
                /*
-                * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
-                * recoveryLastXTime.
-                *
-                * This is slightly confusing if we're starting from an online
-                * checkpoint; we've just read and replayed the checkpoint record, but
-                * we're going to start replay from its redo pointer, which precedes
-                * the location of the checkpoint record itself. So even though the
-                * last record we've replayed is indeed ReadRecPtr, we haven't
-                * replayed all the preceding records yet. That's OK for the current
-                * use of these variables.
+                * Initialize shared variables for tracking progress of WAL replay, as
+                * if we had just replayed the record before the REDO location (or the
+                * checkpoint record itself, if it's a shutdown checkpoint).
                 */
                SpinLockAcquire(&xlogctl->info_lck);
-               xlogctl->replayEndRecPtr = ReadRecPtr;
+               if (checkPoint.redo < RecPtr)
+                       xlogctl->replayEndRecPtr = checkPoint.redo;
+               else
+                       xlogctl->replayEndRecPtr = EndRecPtr;
                xlogctl->replayEndTLI = ThisTimeLineID;
-               xlogctl->lastReplayedEndRecPtr = EndRecPtr;
-               xlogctl->lastReplayedTLI = ThisTimeLineID;
+               xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
+               xlogctl->lastReplayedTLI = xlogctl->replayEndTLI;
                xlogctl->recoveryLastXTime = 0;
                xlogctl->currentChunkStartTime = 0;
                xlogctl->recoveryPause = false;
@@ -6673,8 +6600,6 @@ StartupXLOG(void)
 
                if (record != NULL)
                {
-                       bool            recoveryContinue = true;
-                       bool            recoveryApply = true;
                        ErrorContextCallback errcallback;
                        TimestampTz xtime;
 
@@ -6703,10 +6628,8 @@ StartupXLOG(void)
                                                        (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
                                                         (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
                                        xlog_outrec(&buf, record);
-                                       appendStringInfo(&buf, " - ");
-                                       RmgrTable[record->xl_rmid].rm_desc(&buf,
-                                                                                                          record->xl_info,
-                                                                                                        XLogRecGetData(record));
+                                       appendStringInfoString(&buf, " - ");
+                                       RmgrTable[record->xl_rmid].rm_desc(&buf, record);
                                        elog(LOG, "%s", buf.data);
                                        pfree(buf.data);
                                }
@@ -6734,19 +6657,27 @@ StartupXLOG(void)
                                /*
                                 * Have we reached our recovery target?
                                 */
-                               if (recoveryStopsHere(record, &recoveryApply))
+                               if (recoveryStopsBefore(record))
                                {
-                                       if (recoveryPauseAtTarget)
-                                       {
-                                               SetRecoveryPause(true);
-                                               recoveryPausesHere();
-                                       }
                                        reachedStopPoint = true;        /* see below */
-                                       recoveryContinue = false;
+                                       break;
+                               }
 
-                                       /* Exit loop if we reached non-inclusive recovery target */
-                                       if (!recoveryApply)
-                                               break;
+                               /*
+                                * If we've been asked to lag the master, wait on latch until
+                                * enough time has passed.
+                                */
+                               if (recoveryApplyDelay(record))
+                               {
+                                       /*
+                                        * We test for paused recovery again here. If the user sets
+                                        * delayed apply, it may be because they expect to pause
+                                        * recovery in case of problems, so we must test again
+                                        * here; otherwise pausing during the delay-wait wouldn't
+                                        * work.
+                                        */
+                                       if (xlogctl->recoveryPause)
+                                               recoveryPausesHere();
                                }
 
                                /* Setup error traceback support for ereport() */
@@ -6860,8 +6791,11 @@ StartupXLOG(void)
                                        WalSndWakeup();
 
                                /* Exit loop if we reached inclusive recovery target */
-                               if (!recoveryContinue)
+                               if (recoveryStopsAfter(record))
+                               {
+                                       reachedStopPoint = true;
                                        break;
+                               }
 
                                /* Else, try to fetch the next WAL record */
                                record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
@@ -6871,6 +6805,19 @@ StartupXLOG(void)
                         * end of main redo apply loop
                         */
 
+                       if (recoveryPauseAtTarget && reachedStopPoint)
+                       {
+                               SetRecoveryPause(true);
+                               recoveryPausesHere();
+                       }
+
+                       /* Allow resource managers to do any required cleanup. */
+                       for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+                       {
+                               if (RmgrTable[rmid].rm_cleanup != NULL)
+                                       RmgrTable[rmid].rm_cleanup();
+                       }
+
                        ereport(LOG,
                                        (errmsg("redo done at %X/%X",
                                                 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
@@ -6964,8 +6911,8 @@ StartupXLOG(void)
        /*
         * Consider whether we need to assign a new timeline ID.
         *
-        * If we are doing an archive recovery, we always assign a new ID.      This
-        * handles a couple of issues.  If we stopped short of the end of WAL
+        * If we are doing an archive recovery, we always assign a new ID.  This
+        * handles a couple of issues.  If we stopped short of the end of WAL
         * during recovery, then we are clearly generating a new timeline and must
         * assign it a unique new ID.  Even if we ran to the end, modifying the
         * current last segment is problematic because it may result in trying to
@@ -7004,6 +6951,8 @@ StartupXLOG(void)
                        snprintf(reason, sizeof(reason),
                                         "at restore point \"%s\"",
                                         recoveryStopName);
+               else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+                       snprintf(reason, sizeof(reason), "reached consistency");
                else
                        snprintf(reason, sizeof(reason), "no recovery target specified");
 
@@ -7034,48 +6983,51 @@ StartupXLOG(void)
        openLogOff = 0;
        Insert = &XLogCtl->Insert;
        Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
-
-       firstIdx = XLogRecEndPtrToBufIdx(EndOfLog);
-       XLogCtl->curridx = firstIdx;
-
-       XLogCtl->xlblocks[firstIdx] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
+       Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
 
        /*
         * Tricky point here: readBuf contains the *last* block that the LastRec
-        * record spans, not the one it starts in.      The last block is indeed the
+        * record spans, not the one it starts in.  The last block is indeed the
         * one we want to use.
         */
-       Assert(readOff == (XLogCtl->xlblocks[firstIdx] - XLOG_BLCKSZ) % XLogSegSize);
-       memcpy((char *) &XLogCtl->pages[firstIdx * XLOG_BLCKSZ], xlogreader->readBuf, XLOG_BLCKSZ);
-       Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
+       if (EndOfLog % XLOG_BLCKSZ != 0)
+       {
+               char       *page;
+               int                     len;
+               int                     firstIdx;
+               XLogRecPtr      pageBeginPtr;
 
-       LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+               pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
+               Assert(readOff == pageBeginPtr % XLogSegSize);
 
-       XLogCtl->LogwrtResult = LogwrtResult;
+               firstIdx = XLogRecPtrToBufIdx(EndOfLog);
 
-       XLogCtl->LogwrtRqst.Write = EndOfLog;
-       XLogCtl->LogwrtRqst.Flush = EndOfLog;
+               /* Copy the valid part of the last block, and zero the rest */
+               page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
+               len = EndOfLog % XLOG_BLCKSZ;
+               memcpy(page, xlogreader->readBuf, len);
+               memset(page + len, 0, XLOG_BLCKSZ - len);
 
-       freespace = INSERT_FREESPACE(EndOfLog);
-       if (freespace > 0)
-       {
-               /* Make sure rest of page is zero */
-               MemSet(&XLogCtl->pages[firstIdx * XLOG_BLCKSZ] + EndOfLog % XLOG_BLCKSZ, 0, freespace);
-               XLogCtl->Write.curridx = firstIdx;
+               XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
+               XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
        }
        else
        {
                /*
-                * Whenever LogwrtResult points to exactly the end of a page,
-                * Write.curridx must point to the *next* page (see XLogWrite()).
-                *
-                * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
-                * this is sufficient.  The first actual attempt to insert a log
-                * record will advance the insert state.
+                * There is no partial block to copy. Just set InitializedUpTo, and
+                * let the first attempt to insert a log record initialize the next
+                * buffer.
                 */
-               XLogCtl->Write.curridx = NextBufIdx(firstIdx);
+               XLogCtl->InitializedUpTo = EndOfLog;
        }
 
+       LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+
+       XLogCtl->LogwrtResult = LogwrtResult;
+
+       XLogCtl->LogwrtRqst.Write = EndOfLog;
+       XLogCtl->LogwrtRqst.Flush = EndOfLog;
+
        /* Pre-scan prepared transactions to find out the range of XIDs present */
        oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
 
@@ -7091,27 +7043,6 @@ StartupXLOG(void)
 
        if (InRecovery)
        {
-               int                     rmid;
-
-               /*
-                * Resource managers might need to write WAL records, eg, to record
-                * index cleanup actions.  So temporarily enable XLogInsertAllowed in
-                * this process only.
-                */
-               LocalSetXLogInsertAllowed();
-
-               /*
-                * Allow resource managers to do any required cleanup.
-                */
-               for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
-               {
-                       if (RmgrTable[rmid].rm_cleanup != NULL)
-                               RmgrTable[rmid].rm_cleanup();
-               }
-
-               /* Disallow XLogInsert again */
-               LocalXLogInsertAllowed = -1;
-
                /*
                 * Perform a checkpoint to update all our recovery activity to disk.
                 *
@@ -7199,7 +7130,7 @@ StartupXLOG(void)
        LWLockRelease(ControlFileLock);
 
        /* start the archive_timeout timer running */
-       XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+       XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
 
        /* also initialize latestCompletedXid, to nextXid - 1 */
        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
@@ -7220,8 +7151,8 @@ StartupXLOG(void)
        /*
         * Perform end of recovery actions for any SLRUs that need it.
         */
-       StartupMultiXact();
        TrimCLOG();
+       TrimMultiXact();
 
        /* Reload shared-memory state for prepared transactions */
        RecoverPreparedTransactions();
@@ -7249,7 +7180,7 @@ StartupXLOG(void)
        XLogReportParameters();
 
        /*
-        * All done.  Allow backends to write WAL.      (Although the bool flag is
+        * All done.  Allow backends to write WAL.  (Although the bool flag is
         * probably atomic in itself, we use the info_lck here to ensure that
         * there are no race conditions concerning visibility of other recent
         * updates to shared memory.)
@@ -7287,6 +7218,8 @@ StartupXLOG(void)
 static void
 CheckRecoveryConsistency(void)
 {
+       XLogRecPtr      lastReplayedEndRecPtr;
+
        /*
         * During crash recovery, we don't reach a consistent state until we've
         * replayed all the WAL.
@@ -7294,11 +7227,17 @@ CheckRecoveryConsistency(void)
        if (XLogRecPtrIsInvalid(minRecoveryPoint))
                return;
 
+       /*
+        * assume that we are called in the startup process, and hence don't need
+        * a lock to read lastReplayedEndRecPtr
+        */
+       lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
+
        /*
         * Have we reached the point where our base backup was completed?
         */
        if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
-               ControlFile->backupEndPoint <= EndRecPtr)
+               ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
        {
                /*
                 * We have reached the end of base backup, as indicated by pg_control.
@@ -7311,8 +7250,8 @@ CheckRecoveryConsistency(void)
 
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
-               if (ControlFile->minRecoveryPoint < EndRecPtr)
-                       ControlFile->minRecoveryPoint = EndRecPtr;
+               if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
+                       ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
 
                ControlFile->backupStartPoint = InvalidXLogRecPtr;
                ControlFile->backupEndPoint = InvalidXLogRecPtr;
@@ -7330,7 +7269,7 @@ CheckRecoveryConsistency(void)
         * consistent yet.
         */
        if (!reachedConsistency && !ControlFile->backupEndRequired &&
-               minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
+               minRecoveryPoint <= lastReplayedEndRecPtr &&
                XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
        {
                /*
@@ -7342,8 +7281,8 @@ CheckRecoveryConsistency(void)
                reachedConsistency = true;
                ereport(LOG,
                                (errmsg("consistent recovery state reached at %X/%X",
-                                               (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
-                                               (uint32) XLogCtl->lastReplayedEndRecPtr)));
+                                               (uint32) (lastReplayedEndRecPtr >> 32),
+                                               (uint32) lastReplayedEndRecPtr)));
        }
 
        /*
@@ -7390,22 +7329,36 @@ RecoveryInProgress(void)
                return false;
        else
        {
-               /* use volatile pointer to prevent code rearrangement */
+               /*
+                * use volatile pointer to make sure we make a fresh read of the
+                * shared variable.
+                */
                volatile XLogCtlData *xlogctl = XLogCtl;
 
-               /* spinlock is essential on machines with weak memory ordering! */
-               SpinLockAcquire(&xlogctl->info_lck);
                LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
-               SpinLockRelease(&xlogctl->info_lck);
 
                /*
                 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
                 * is finished. InitPostgres() relies upon this behaviour to ensure
-                * that InitXLOGAccess() is called at backend startup.  (If you change
+                * that InitXLOGAccess() is called at backend startup.  (If you change
                 * this, see also LocalSetXLogInsertAllowed.)
                 */
                if (!LocalRecoveryInProgress)
+               {
+                       /*
+                        * If we just exited recovery, make sure we read TimeLineID and
+                        * RedoRecPtr after SharedRecoveryInProgress (for machines with
+                        * weak memory ordering).
+                        */
+                       pg_memory_barrier();
                        InitXLOGAccess();
+               }
+
+               /*
+                * Note: We don't need a memory barrier when we're still in recovery.
+                * We might exit recovery immediately after return, so the caller
+                * can't rely on 'true' meaning that we're still in recovery anyway.
+                */
 
                return LocalRecoveryInProgress;
        }
@@ -7417,7 +7370,8 @@ RecoveryInProgress(void)
  * true. Postmaster knows this by way of signal, not via shared memory.
  *
  * Unlike testing standbyState, this works in any process that's connected to
- * shared memory.
+ * shared memory.  (And note that standbyState alone doesn't tell the truth
+ * anyway.)
  */
 bool
 HotStandbyActive(void)
@@ -7443,6 +7397,17 @@ HotStandbyActive(void)
        }
 }
 
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+bool
+HotStandbyActiveInReplay(void)
+{
+       Assert(AmStartupProcess());
+       return LocalHotStandbyActive;
+}
+
 /*
  * Is this process allowed to insert new WAL records?
  *
@@ -7629,6 +7594,11 @@ InitXLOGAccess(void)
        ThisTimeLineID = XLogCtl->ThisTimeLineID;
        Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
 
+       /* Initialize our copy of WALInsertLocks and register the tranche */
+       WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
+       LWLockRegisterTranche(XLogCtl->Insert.WALInsertLockTrancheId,
+                                                 &XLogCtl->Insert.WALInsertLockTranche);
+
        /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
        (void) GetRedoRecPtr();
 }
@@ -7643,11 +7613,11 @@ GetRedoRecPtr(void)
 {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
-       XLogRecPtr ptr;
+       XLogRecPtr      ptr;
 
        /*
         * The possibly not up-to-date copy in XlogCtl is enough. Even if we
-        * grabbed a WAL insertion slot to read the master copy, someone might
+        * grabbed a WAL insertion lock to read the master copy, someone might
         * update it just after we've released the lock.
         */
        SpinLockAcquire(&xlogctl->info_lck);
@@ -7665,7 +7635,7 @@ GetRedoRecPtr(void)
  *
  * NOTE: The value *actually* returned is the position of the last full
  * xlog page. It lags behind the real insert position by at most 1 page.
- * For that, we don't need to scan through WAL insertion slots, and an
+ * For that, we don't need to scan through WAL insertion locks, and an
  * approximation is enough for the current usage of this function.
  */
 XLogRecPtr
@@ -7710,7 +7680,7 @@ GetLastSegSwitchTime(void)
 
        /* Need WALWriteLock, but shared lock is sufficient */
        LWLockAcquire(WALWriteLock, LW_SHARED);
-       result = XLogCtl->Write.lastSegSwitchTime;
+       result = XLogCtl->lastSegSwitchTime;
        LWLockRelease(WALWriteLock);
 
        return result;
@@ -8026,13 +7996,13 @@ CreateCheckPoint(int flags)
         * We must block concurrent insertions while examining insert state to
         * determine the checkpoint REDO pointer.
         */
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
 
        /*
         * If this isn't a shutdown or forced checkpoint, and we have not inserted
         * any XLOG records since the start of the last checkpoint, skip the
-        * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
+        * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
         * when the system is idle. That wastes log space, and more importantly it
         * exposes us to possible loss of both current and previous checkpoint
         * records if the machine crashes just as we're writing the update.
@@ -8051,7 +8021,7 @@ CreateCheckPoint(int flags)
                        MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
                        ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
                {
-                       WALInsertSlotRelease();
+                       WALInsertLockRelease();
                        LWLockRelease(CheckpointLock);
                        END_CRIT_SECTION();
                        return;
@@ -8095,7 +8065,7 @@ CreateCheckPoint(int flags)
 
        /*
         * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-        * must be done while holding the insertion slots.
+        * must be done while holding all the insertion locks.
         *
         * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
         * pointing past where it really needs to point.  This is okay; the only
@@ -8107,10 +8077,10 @@ CreateCheckPoint(int flags)
        RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
 
        /*
-        * Now we can release the WAL insertion slots, allowing other xacts to
+        * Now we can release the WAL insertion locks, allowing other xacts to
         * proceed while we are flushing disk buffers.
         */
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 
        /* Update the info_lck-protected copy of RedoRecPtr as well */
        SpinLockAcquire(&xlogctl->info_lck);
@@ -8126,46 +8096,6 @@ CreateCheckPoint(int flags)
 
        TRACE_POSTGRESQL_CHECKPOINT_START(flags);
 
-       /*
-        * In some cases there are groups of actions that must all occur on one
-        * side or the other of a checkpoint record. Before flushing the
-        * checkpoint record we must explicitly wait for any backend currently
-        * performing those groups of actions.
-        *
-        * One example is end of transaction, so we must wait for any transactions
-        * that are currently in commit critical sections.      If an xact inserted
-        * its commit record into XLOG just before the REDO point, then a crash
-        * restart from the REDO point would not replay that record, which means
-        * that our flushing had better include the xact's update of pg_clog.  So
-        * we wait till he's out of his commit critical section before proceeding.
-        * See notes in RecordTransactionCommit().
-        *
-        * Because we've already released the insertion slots, this test is a bit
-        * fuzzy: it is possible that we will wait for xacts we didn't really need
-        * to wait for.  But the delay should be short and it seems better to make
-        * checkpoint take a bit longer than to hold off insertions longer than
-        * necessary.
-        * (In fact, the whole reason we have this issue is that xact.c does
-        * commit record XLOG insertion and clog update as two separate steps
-        * protected by different locks, but again that seems best on grounds of
-        * minimizing lock contention.)
-        *
-        * A transaction that has not yet set delayChkpt when we look cannot be at
-        * risk, since he's not inserted his commit record yet; and one that's
-        * already cleared it is not at risk either, since he's done fixing clog
-        * and we will correctly flush the update below.  So we cannot miss any
-        * xacts we need to wait for.
-        */
-       vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
-       if (nvxids > 0)
-       {
-               do
-               {
-                       pg_usleep(10000L);      /* wait for 10 msec */
-               } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
-       }
-       pfree(vxids);
-
        /*
         * Get the other info we need for the checkpoint record.
         */
@@ -8202,6 +8132,45 @@ CreateCheckPoint(int flags)
         */
        END_CRIT_SECTION();
 
+       /*
+        * In some cases there are groups of actions that must all occur on one
+        * side or the other of a checkpoint record. Before flushing the
+        * checkpoint record we must explicitly wait for any backend currently
+        * performing those groups of actions.
+        *
+        * One example is end of transaction, so we must wait for any transactions
+        * that are currently in commit critical sections.  If an xact inserted
+        * its commit record into XLOG just before the REDO point, then a crash
+        * restart from the REDO point would not replay that record, which means
+        * that our flushing had better include the xact's update of pg_clog.  So
+        * we wait till he's out of his commit critical section before proceeding.
+        * See notes in RecordTransactionCommit().
+        *
+        * Because we've already released the insertion locks, this test is a bit
+        * fuzzy: it is possible that we will wait for xacts we didn't really need
+        * to wait for.  But the delay should be short and it seems better to make
+        * checkpoint take a bit longer than to hold off insertions longer than
+        * necessary. (In fact, the whole reason we have this issue is that xact.c
+        * does commit record XLOG insertion and clog update as two separate steps
+        * protected by different locks, but again that seems best on grounds of
+        * minimizing lock contention.)
+        *
+        * A transaction that has not yet set delayChkpt when we look cannot be at
+        * risk, since he's not inserted his commit record yet; and one that's
+        * already cleared it is not at risk either, since he's done fixing clog
+        * and we will correctly flush the update below.  So we cannot miss any
+        * xacts we need to wait for.
+        */
+       vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+       if (nvxids > 0)
+       {
+               do
+               {
+                       pg_usleep(10000L);      /* wait for 10 msec */
+               } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+       }
+       pfree(vxids);
+
        CheckPointGuts(checkPoint.redo, flags);
 
        /*
@@ -8304,6 +8273,12 @@ CreateCheckPoint(int flags)
         */
        END_CRIT_SECTION();
 
+       /*
+        * Now that the checkpoint is safely on disk, we can update the point to
+        * which multixact can be truncated.
+        */
+       MultiXactSetSafeTruncate(checkPoint.oldestMulti);
+
        /*
         * Let smgr do post-checkpoint cleanup (eg, deleting old files).
         */
@@ -8329,13 +8304,18 @@ CreateCheckPoint(int flags)
 
        /*
         * Truncate pg_subtrans if possible.  We can throw away all data before
-        * the oldest XMIN of any running transaction.  No future transaction will
+        * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).      During recovery, though, we mustn't do this because
+        * in subtrans.c).  During recovery, though, we mustn't do this because
         * StartupSUBTRANS hasn't been called yet.
         */
        if (!RecoveryInProgress())
-               TruncateSUBTRANS(GetOldestXmin(true, false));
+               TruncateSUBTRANS(GetOldestXmin(NULL, false));
+
+       /*
+        * Truncate pg_multixact too.
+        */
+       TruncateMultiXact();
 
        /* Real work is done, but log and update stats before releasing lock. */
        LogCheckpointEnd(false);
@@ -8358,7 +8338,7 @@ CreateCheckPoint(int flags)
  * CreateRestartPoint() allows for the case where recovery may end before
  * the restartpoint completes so there is no concern of concurrent behaviour.
  */
-void
+static void
 CreateEndOfRecoveryRecord(void)
 {
        xl_end_of_recovery xlrec;
@@ -8371,10 +8351,10 @@ CreateEndOfRecoveryRecord(void)
 
        xlrec.end_time = time(NULL);
 
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        xlrec.ThisTimeLineID = ThisTimeLineID;
        xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 
        LocalSetXLogInsertAllowed();
 
@@ -8419,6 +8399,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
        CheckPointMultiXact();
        CheckPointPredicate();
        CheckPointRelationMap();
+       CheckPointReplicationSlots();
+       CheckPointSnapBuild();
+       CheckPointLogicalRewriteHeap();
        CheckPointBuffers(flags);       /* performs all required fsyncs */
        /* We deliberately delay 2PC checkpointing as long as possible */
        CheckPointTwoPhase(checkPointRedo);
@@ -8437,31 +8420,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
 {
-       int                     rmid;
-
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
 
-       /*
-        * Is it safe to restartpoint?  We must ask each of the resource managers
-        * whether they have any partial state information that might prevent a
-        * correct restart from this point.  If so, we skip this opportunity, but
-        * return at the next checkpoint record for another try.
-        */
-       for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
-       {
-               if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
-                       if (!(RmgrTable[rmid].rm_safe_restartpoint()))
-                       {
-                               elog(trace_recovery(DEBUG2),
-                                        "RM %d not safe to record restart point at %X/%X",
-                                        rmid,
-                                        (uint32) (checkPoint->redo >> 32),
-                                        (uint32) checkPoint->redo);
-                               return;
-                       }
-       }
-
        /*
         * Also refrain from creating a restartpoint if we have seen any
         * references to non-existent pages. Restarting recovery from the
@@ -8579,9 +8540,9 @@ CreateRestartPoint(int flags)
         * during recovery this is just pro forma, because no WAL insertions are
         * happening.
         */
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 
        /* Also update the info_lck-protected copy */
        SpinLockAcquire(&xlogctl->info_lck);
@@ -8653,11 +8614,11 @@ CreateRestartPoint(int flags)
                _logSegNo--;
 
                /*
-                * Try to recycle segments on a useful timeline. If we've been promoted
-                * since the beginning of this restartpoint, use the new timeline
-                * chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
-                * in that case). If we're still in recovery, use the timeline we're
-                * currently replaying.
+                * Try to recycle segments on a useful timeline. If we've been
+                * promoted since the beginning of this restartpoint, use the new
+                * timeline chosen at end of recovery (RecoveryInProgress() sets
+                * ThisTimeLineID in that case). If we're still in recovery, use the
+                * timeline we're currently replaying.
                 *
                 * There is no guarantee that the WAL segments will be useful on the
                 * current timeline; if recovery proceeds to a new timeline right
@@ -8687,15 +8648,30 @@ CreateRestartPoint(int flags)
                        ThisTimeLineID = 0;
        }
 
+       /*
+        * Due to a historical accident, multixact truncations are not WAL-logged,
+        * but just performed every time the mxact horizon is increased. So, unless
+        * we explicitly execute truncations on a standby, it will never clean out
+        * /pg_multixact which obviously is bad, both because it uses space and
+        * because we can wrap around into pre-existing data...
+        *
+        * We can only do the truncation here, after the UpdateControlFile()
+        * above, because we've now safely established a restart point.  That
+        * guarantees we will not need to access those multis.
+        *
+        * It's probably worth improving this.
+        */
+       TruncateMultiXact();
+
        /*
         * Truncate pg_subtrans if possible.  We can throw away all data before
-        * the oldest XMIN of any running transaction.  No future transaction will
+        * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).      When hot standby is disabled, though, we mustn't do
+        * in subtrans.c).  When hot standby is disabled, though, we mustn't do
         * this because StartupSUBTRANS hasn't been called yet.
         */
        if (EnableHotStandby)
-               TruncateSUBTRANS(GetOldestXmin(true, false));
+               TruncateSUBTRANS(GetOldestXmin(NULL, false));
 
        /* Real work is done, but log and update before releasing lock. */
        LogCheckpointEnd(true);
@@ -8722,24 +8698,43 @@ CreateRestartPoint(int flags)
 
 /*
  * Retreat *logSegNo to the last segment that we need to retain because of
- * wal_keep_segments. This is calculated by subtracting wal_keep_segments
- * from the given xlog location, recptr.
+ * either wal_keep_segments or replication slots.
+ *
+ * This is calculated by subtracting wal_keep_segments from the given xlog
+ * location, recptr, and by making sure that the result is below the
+ * requirement of replication slots.
  */
 static void
 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
 {
        XLogSegNo       segno;
-
-       if (wal_keep_segments == 0)
-               return;
+       XLogRecPtr      keep;
 
        XLByteToSeg(recptr, segno);
+       keep = XLogGetReplicationSlotMinimumLSN();
 
-       /* avoid underflow, don't go below 1 */
-       if (segno <= wal_keep_segments)
-               segno = 1;
-       else
-               segno = segno - wal_keep_segments;
+       /* compute limit for wal_keep_segments first */
+       if (wal_keep_segments > 0)
+       {
+               /* avoid underflow, don't go below 1 */
+               if (segno <= wal_keep_segments)
+                       segno = 1;
+               else
+                       segno = segno - wal_keep_segments;
+       }
+
+       /* then check whether slots limit removal further */
+       if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
+       {
+               XLogRecPtr      slotSegNo;
+
+               XLByteToSeg(keep, slotSegNo);
+
+               if (slotSegNo <= 0)
+                       segno = 1;
+               else if (slotSegNo < segno)
+                       segno = slotSegNo;
+       }
 
        /* don't delete WAL segments newer than the calculated segment */
        if (segno < *logSegNo)
@@ -8764,7 +8759,7 @@ XLogPutNextOid(Oid nextOid)
         * We need not flush the NEXTOID record immediately, because any of the
         * just-allocated OIDs could only reach disk as part of a tuple insert or
         * update that would have its own XLOG record that must follow the NEXTOID
-        * record.      Therefore, the standard buffer LSN interlock applied to those
+        * record.  Therefore, the standard buffer LSN interlock applied to those
         * records will ensure no such OID reaches disk before the NEXTOID record
         * does.
         *
@@ -8818,7 +8813,7 @@ XLogRestorePoint(const char *rpName)
        xl_restore_point xlrec;
 
        xlrec.rp_time = GetCurrentTimestamp();
-       strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
+       strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
 
        rdata.buffer = InvalidBuffer;
        rdata.data = (char *) &xlrec;
@@ -8893,8 +8888,9 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
                 * lsn updates. We assume pd_lower/upper cannot be changed without an
                 * exclusive lock, so the contents bkp are not racy.
                 *
-                * With buffer_std set to false, XLogCheckBuffer() sets hole_length and
-                * hole_offset to 0; so the following code is safe for either case.
+                * With buffer_std set to false, XLogCheckBuffer() sets hole_length
+                * and hole_offset to 0; so the following code is safe for either
+                * case.
                 */
                memcpy(copied_buffer, origdata, bkpb.hole_offset);
                memcpy(copied_buffer + bkpb.hole_offset,
@@ -8931,6 +8927,7 @@ static void
 XLogReportParameters(void)
 {
        if (wal_level != ControlFile->wal_level ||
+               wal_log_hints != ControlFile->wal_log_hints ||
                MaxConnections != ControlFile->MaxConnections ||
                max_worker_processes != ControlFile->max_worker_processes ||
                max_prepared_xacts != ControlFile->max_prepared_xacts ||
@@ -8947,19 +8944,22 @@ XLogReportParameters(void)
                {
                        XLogRecData rdata;
                        xl_parameter_change xlrec;
+                       XLogRecPtr      recptr;
 
                        xlrec.MaxConnections = MaxConnections;
                        xlrec.max_worker_processes = max_worker_processes;
                        xlrec.max_prepared_xacts = max_prepared_xacts;
                        xlrec.max_locks_per_xact = max_locks_per_xact;
                        xlrec.wal_level = wal_level;
+                       xlrec.wal_log_hints = wal_log_hints;
 
                        rdata.buffer = InvalidBuffer;
                        rdata.data = (char *) &xlrec;
                        rdata.len = sizeof(xlrec);
                        rdata.next = NULL;
 
-                       XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
+                       recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
+                       XLogFlush(recptr);
                }
 
                ControlFile->MaxConnections = MaxConnections;
@@ -8967,6 +8967,7 @@ XLogReportParameters(void)
                ControlFile->max_prepared_xacts = max_prepared_xacts;
                ControlFile->max_locks_per_xact = max_locks_per_xact;
                ControlFile->wal_level = wal_level;
+               ControlFile->wal_log_hints = wal_log_hints;
                UpdateControlFile();
        }
 }
@@ -9004,9 +9005,9 @@ UpdateFullPageWrites(void)
         */
        if (fullPageWrites)
        {
-               WALInsertSlotAcquire(true);
+               WALInsertLockAcquireExclusive();
                Insert->fullPageWrites = true;
-               WALInsertSlotRelease();
+               WALInsertLockRelease();
        }
 
        /*
@@ -9027,9 +9028,9 @@ UpdateFullPageWrites(void)
 
        if (!fullPageWrites)
        {
-               WALInsertSlotAcquire(true);
+               WALInsertLockAcquireExclusive();
                Insert->fullPageWrites = false;
-               WALInsertSlotRelease();
+               WALInsertLockRelease();
        }
        END_CRIT_SECTION();
 }
@@ -9046,7 +9047,7 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
        /* Check that the record agrees on what the current (old) timeline is */
        if (prevTLI != ThisTimeLineID)
                ereport(PANIC,
-                               (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
+                               (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
                                                prevTLI, ThisTimeLineID)));
 
        /*
@@ -9101,7 +9102,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                /*
                 * We used to try to take the maximum of ShmemVariableCache->nextOid
                 * and the recorded nextOid, but that fails if the OID counter wraps
-                * around.      Since no OID allocation should be happening during replay
+                * around.  Since no OID allocation should be happening during replay
                 * anyway, better to just believe the record exactly.  We still take
                 * OidGenLock while setting the variable, just in case.
                 */
@@ -9128,6 +9129,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                          checkPoint.nextMultiOffset);
                SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
                SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+               MultiXactSetSafeTruncate(checkPoint.oldestMulti);
 
                /*
                 * If we see a shutdown checkpoint while waiting for an end-of-backup
@@ -9228,6 +9230,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                                  checkPoint.oldestXidDB);
                MultiXactAdvanceOldest(checkPoint.oldestMulti,
                                                           checkPoint.oldestMultiDB);
+               MultiXactSetSafeTruncate(checkPoint.oldestMulti);
 
                /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
                ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
@@ -9291,10 +9294,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                BkpBlock        bkpb;
 
                /*
-                * Full-page image (FPI) records contain a backup block stored "inline"
-                * in the normal data since the locking when writing hint records isn't
-                * sufficient to use the normal backup block mechanism, which assumes
-                * exclusive lock on the buffer supplied.
+                * Full-page image (FPI) records contain a backup block stored
+                * "inline" in the normal data since the locking when writing hint
+                * records isn't sufficient to use the normal backup block mechanism,
+                * which assumes exclusive lock on the buffer supplied.
                 *
                 * Since the only change in these backup block are hint bits, there
                 * are no recovery conflicts generated.
@@ -9353,6 +9356,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
                ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
                ControlFile->wal_level = xlrec.wal_level;
+               ControlFile->wal_log_hints = wal_log_hints;
 
                /*
                 * Update minRecoveryPoint to ensure that if recovery is aborted, we
@@ -9443,7 +9447,7 @@ get_sync_bit(int method)
 
        /*
         * Optimize writes by bypassing kernel cache with O_DIRECT when using
-        * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
+        * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
         * disabled, otherwise the archive command or walsender process will read
         * the WAL soon after writing it, which is guaranteed to cause a physical
         * read if we bypassed the kernel cache. We also skip the
@@ -9594,6 +9598,9 @@ XLogFileNameP(TimeLineID tli, XLogSegNo segno)
  *
  * Every successfully started non-exclusive backup must be stopped by calling
  * do_pg_stop_backup() or do_pg_abort_backup().
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
  */
 XLogRecPtr
 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
@@ -9614,11 +9621,6 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 
        backup_started_in_recovery = RecoveryInProgress();
 
-       if (!superuser() && !has_rolreplication(GetUserId()))
-               ereport(ERROR,
-                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-                  errmsg("must be superuser or replication role to run a backup")));
-
        /*
         * Currently only non-exclusive backup can be taken during recovery.
         */
@@ -9636,7 +9638,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                          errmsg("WAL level not sufficient for making an online backup"),
-                                errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+                                errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
 
        if (strlen(backupidstr) > MAXPGPATH)
                ereport(ERROR,
@@ -9649,7 +9651,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
         * during an on-line backup even if not doing so at other times, because
         * it's quite possible for the backup dump to obtain a "torn" (partially
         * written) copy of a database page if it reads the page concurrently with
-        * our write to the same page.  This can be fixed as long as the first
+        * our write to the same page.  This can be fixed as long as the first
         * write to the page in the WAL sequence is a full-page write. Hence, we
         * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
         * are no dirty pages in shared memory that might get dumped while the
@@ -9661,15 +9663,15 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
         * Note that forcePageWrites has no effect during an online backup from
         * the standby.
         *
-        * We must hold all the insertion slots to change the value of
+        * We must hold all the insertion locks to change the value of
         * forcePageWrites, to ensure adequate interlocking against XLogInsert().
         */
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        if (exclusive)
        {
                if (XLogCtl->Insert.exclusiveBackup)
                {
-                       WALInsertSlotRelease();
+                       WALInsertLockRelease();
                        ereport(ERROR,
                                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                                         errmsg("a backup is already in progress"),
@@ -9680,7 +9682,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
        else
                XLogCtl->Insert.nonExclusiveBackups++;
        XLogCtl->Insert.forcePageWrites = true;
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 
        /* Ensure we release forcePageWrites if fail below */
        PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
@@ -9693,7 +9695,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
                 * old timeline IDs.  That would otherwise happen if you called
                 * pg_start_backup() right after restoring from a PITR archive: the
                 * first WAL segment containing the startup checkpoint has pages in
-                * the beginning with the old timeline ID.      That can cause trouble at
+                * the beginning with the old timeline ID.  That can cause trouble at
                 * recovery: we won't have a history file covering the old timeline if
                 * pg_xlog directory was not included in the base backup and the WAL
                 * archive was cleared too before starting the backup.
@@ -9716,7 +9718,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
                        bool            checkpointfpw;
 
                        /*
-                        * Force a CHECKPOINT.  Aside from being necessary to prevent torn
+                        * Force a CHECKPOINT.  Aside from being necessary to prevent torn
                         * page problems, this guarantees that two successive backup runs
                         * will have different checkpoint positions and hence different
                         * history file names, even if nothing happened in between.
@@ -9795,13 +9797,13 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
                         * taking a checkpoint right after another is not that expensive
                         * either because only few buffers have been dirtied yet.
                         */
-                       WALInsertSlotAcquire(true);
+                       WALInsertLockAcquireExclusive();
                        if (XLogCtl->Insert.lastBackupStart < startpoint)
                        {
                                XLogCtl->Insert.lastBackupStart = startpoint;
                                gotUniqueStartpoint = true;
                        }
-                       WALInsertSlotRelease();
+                       WALInsertLockRelease();
                } while (!gotUniqueStartpoint);
 
                XLByteToSeg(startpoint, _logSegNo);
@@ -9891,7 +9893,7 @@ pg_start_backup_callback(int code, Datum arg)
        bool            exclusive = DatumGetBool(arg);
 
        /* Update backup counters and forcePageWrites on failure */
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        if (exclusive)
        {
                Assert(XLogCtl->Insert.exclusiveBackup);
@@ -9908,7 +9910,7 @@ pg_start_backup_callback(int code, Datum arg)
        {
                XLogCtl->Insert.forcePageWrites = false;
        }
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 }
 
 /*
@@ -9920,6 +9922,9 @@ pg_start_backup_callback(int code, Datum arg)
  *
  * Returns the last WAL position that must be present to restore from this
  * backup, and the corresponding timeline ID in *stoptli_p.
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
  */
 XLogRecPtr
 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
@@ -9952,11 +9957,6 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
 
        backup_started_in_recovery = RecoveryInProgress();
 
-       if (!superuser() && !has_rolreplication(GetUserId()))
-               ereport(ERROR,
-                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-                (errmsg("must be superuser or replication role to run a backup"))));
-
        /*
         * Currently only non-exclusive backup can be taken during recovery.
         */
@@ -9974,12 +9974,12 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
                ereport(ERROR,
                                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                          errmsg("WAL level not sufficient for making an online backup"),
-                                errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
+                                errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
 
        /*
         * OK to update backup counters and forcePageWrites
         */
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        if (exclusive)
                XLogCtl->Insert.exclusiveBackup = false;
        else
@@ -9999,7 +9999,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
        {
                XLogCtl->Insert.forcePageWrites = false;
        }
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 
        if (exclusive)
        {
@@ -10278,13 +10278,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
  * an error handler.
  *
  * NB: This is only for aborting a non-exclusive backup that doesn't write
- * backup_label. A backup started with pg_stop_backup() needs to be finished
+ * backup_label. A backup started with pg_start_backup() needs to be finished
  * with pg_stop_backup().
  */
 void
 do_pg_abort_backup(void)
 {
-       WALInsertSlotAcquire(true);
+       WALInsertLockAcquireExclusive();
        Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
        XLogCtl->Insert.nonExclusiveBackups--;
 
@@ -10293,7 +10293,7 @@ do_pg_abort_backup(void)
        {
                XLogCtl->Insert.forcePageWrites = false;
        }
-       WALInsertSlotRelease();
+       WALInsertLockRelease();
 }
 
 /*
@@ -10371,7 +10371,7 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
  *
  * If we see a backup_label during recovery, we assume that we are recovering
  * from a backup dump file, and we therefore roll forward from the checkpoint
- * identified by the label file, NOT what pg_control says.     This avoids the
+ * identified by the label file, NOT what pg_control says.  This avoids the
  * problem that pg_control might have been archived one or more checkpoints
  * later than the start of the dump, and so if we rely on it as the start
  * point, we will fail to restore a consistent database state.
@@ -10466,9 +10466,7 @@ rm_redo_error_callback(void *arg)
        StringInfoData buf;
 
        initStringInfo(&buf);
-       RmgrTable[record->xl_rmid].rm_desc(&buf,
-                                                                          record->xl_info,
-                                                                          XLogRecGetData(record));
+       RmgrTable[record->xl_rmid].rm_desc(&buf, record);
 
        /* don't bother emitting empty description */
        if (buf.len > 0)
@@ -10698,7 +10696,7 @@ next_record_is_invalid:
  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
  * used to decide which timeline to stream the requested WAL from.
  *
- * If the the record is not immediately available, the function returns false
+ * If the record is not immediately available, the function returns false
  * if we're not in standby mode. In standby mode, waits for it to become
  * available.
  *
@@ -10717,17 +10715,15 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
        /*-------
         * Standby mode is implemented by a state machine:
         *
-        * 1. Read from archive (XLOG_FROM_ARCHIVE)
-        * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
-        * 3. Check trigger file
-        * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
-        * 5. Rescan timelines
-        * 6. Sleep 5 seconds, and loop back to 1.
+        * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
+        *        pg_xlog (XLOG_FROM_PG_XLOG)
+        * 2. Check trigger file
+        * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
+        * 4. Rescan timelines
+        * 5. Sleep 5 seconds, and loop back to 1.
         *
         * Failure to read from the current source advances the state machine to
-        * the next state. In addition, successfully reading a file from pg_xlog
-        * moves the state machine from state 2 back to state 1 (we always prefer
-        * files in the archive over files in pg_xlog).
+        * the next state.
         *
         * 'currentSource' indicates the current state. There are no currentSource
         * values for "check trigger", "rescan timelines", and "sleep" states,
@@ -10755,9 +10751,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                        switch (currentSource)
                        {
                                case XLOG_FROM_ARCHIVE:
-                                       currentSource = XLOG_FROM_PG_XLOG;
-                                       break;
-
                                case XLOG_FROM_PG_XLOG:
 
                                        /*
@@ -10810,7 +10803,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                                                         tli, curFileTLI);
                                                }
                                                curFileTLI = tli;
-                                               RequestXLogStreaming(tli, ptr, PrimaryConnInfo);
+                                               RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
+                                                                                        PrimarySlotName);
                                                receivedUpto = 0;
                                        }
 
@@ -10922,7 +10916,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                 * Try to restore the file from archive, or read an existing
                                 * file from pg_xlog.
                                 */
-                               readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
+                               readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
+                                                currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+                                                                                         currentSource);
                                if (readFile >= 0)
                                        return true;    /* success! */
 
@@ -10979,11 +10975,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                                        if (havedata)
                                        {
                                                /*
-                                                * Great, streamed far enough.  Open the file if it's
+                                                * Great, streamed far enough.  Open the file if it's
                                                 * not open already.  Also read the timeline history
                                                 * file if we haven't initialized timeline history
                                                 * yet; it should be streamed over and present in
-                                                * pg_xlog by now.      Use XLOG_FROM_STREAM so that
+                                                * pg_xlog by now.  Use XLOG_FROM_STREAM so that
                                                 * source info is set correctly and XLogReceiptTime
                                                 * isn't changed.
                                                 */
@@ -11046,9 +11042,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
                 * process.
                 */
                HandleStartupProcInterrupts();
-       } while (StandbyMode);
+       }
 
-       return false;
+       return false;                           /* not reached */
 }
 
 /*
@@ -11056,9 +11052,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
  * in the current WAL page, previously read by XLogPageRead().
  *
  * 'emode' is the error mode that would be used to report a file-not-found
- * or legitimate end-of-WAL situation.  Generally, we use it as-is, but if
+ * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
  * we're retrying the exact same record that we've tried previously, only
- * complain the first time to keep the noise down.     However, we only do when
+ * complain the first time to keep the noise down.  However, we only do so when
  * reading from pg_xlog, because we don't expect any invalid records in archive
  * or in records streamed from master. Files in the archive should be complete,
  * and we should never hit the end of WAL because we stop and wait for more WAL
@@ -11101,19 +11097,20 @@ CheckForStandbyTrigger(void)
        {
                /*
                 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
-                * signal handler. We now leave the file in place and let the Startup
-                * process do the unlink. This allows Startup to know whether we're
-                * doing fast or normal promotion. Fast promotion takes precedence.
+                * signal handler. It now leaves the file in place and lets the
+                * Startup process do the unlink. This allows Startup to know whether
+                * it should create a full checkpoint before starting up (fallback
+                * mode). Fast promotion takes precedence.
                 */
-               if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+               if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
                {
-                       unlink(FAST_PROMOTE_SIGNAL_FILE);
                        unlink(PROMOTE_SIGNAL_FILE);
+                       unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
                        fast_promote = true;
                }
-               else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+               else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
                {
-                       unlink(PROMOTE_SIGNAL_FILE);
+                       unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
                        fast_promote = false;
                }
 
@@ -11136,6 +11133,12 @@ CheckForStandbyTrigger(void)
                fast_promote = true;
                return true;
        }
+       else if (errno != ENOENT)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not stat trigger file \"%s\": %m",
+                                               TriggerFile)));
+
        return false;
 }
 
@@ -11149,7 +11152,7 @@ CheckPromoteSignal(void)
        struct stat stat_buf;
 
        if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
-               stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+               stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
                return true;
 
        return false;