]> granicus.if.org Git - postgresql/blobdiff - src/backend/access/transam/xact.c
Enable probes to work with Mac OS X Leopard and other OSes that will
[postgresql] / src / backend / access / transam / xact.c
index 0bbe2c0d4956272b4230403e7f694298e2e3ff61..9af53a5953f09e2b2726fc7efa8ad4fcc1f2b9ab 100644 (file)
@@ -5,12 +5,12 @@
  *
  * See src/backend/access/transam/README for more information.
  *
- * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.219 2006/03/29 21:17:37 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.260 2008/03/17 19:44:41 petere Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include "access/multixact.h"
 #include "access/subtrans.h"
+#include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
-#include "catalog/heap.h"
-#include "catalog/index.h"
+#include "access/xlogutils.h"
 #include "catalog/namespace.h"
 #include "commands/async.h"
 #include "commands/tablecmds.h"
 #include "executor/spi.h"
 #include "libpq/be-fsstubs.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "storage/fd.h"
-#include "storage/proc.h"
+#include "storage/lmgr.h"
 #include "storage/procarray.h"
+#include "storage/sinvaladt.h"
 #include "storage/smgr.h"
+#include "utils/combocid.h"
 #include "utils/flatfiles.h"
 #include "utils/guc.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
-#include "utils/portal.h"
 #include "utils/relcache.h"
-#include "utils/resowner.h"
-#include "pgstat.h"
+#include "utils/xml.h"
+#include "pg_trace.h"
 
 
 /*
@@ -56,21 +58,30 @@ int                 XactIsoLevel;
 bool           DefaultXactReadOnly = false;
 bool           XactReadOnly;
 
+bool           XactSyncCommit = true;
+
 int                    CommitDelay = 0;        /* precommit delay in microseconds */
 int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 
+/*
+ * MyXactAccessedTempRel is set when a temporary relation is accessed.
+ * We don't allow PREPARE TRANSACTION in that case.  (This is global
+ * so that it can be set from heapam.c.)
+ */
+bool           MyXactAccessedTempRel = false;
+
 
 /*
  *     transaction states - transaction state from server perspective
  */
 typedef enum TransState
 {
-       TRANS_DEFAULT,
-       TRANS_START,
-       TRANS_INPROGRESS,
-       TRANS_COMMIT,
-       TRANS_ABORT,
-       TRANS_PREPARE
+       TRANS_DEFAULT,                          /* idle */
+       TRANS_START,                            /* transaction starting */
+       TRANS_INPROGRESS,                       /* inside a valid transaction */
+       TRANS_COMMIT,                           /* commit in progress */
+       TRANS_ABORT,                            /* abort in progress */
+       TRANS_PREPARE                           /* prepare in progress */
 } TransState;
 
 /*
@@ -116,25 +127,21 @@ typedef struct TransactionStateData
        int                     savepointLevel; /* savepoint level */
        TransState      state;                  /* low-level state */
        TBlockState blockState;         /* high-level state */
-       int                     nestingLevel;   /* nest depth */
+       int                     nestingLevel;   /* transaction nesting depth */
+       int                     gucNestLevel;   /* GUC context nesting depth */
        MemoryContext curTransactionContext;            /* my xact-lifetime context */
        ResourceOwner curTransactionOwner;      /* my query resources */
-       List       *childXids;          /* subcommitted child XIDs */
-       Oid                     currentUser;    /* subxact start current_user */
+       TransactionId *childXids;       /* subcommitted child XIDs, in XID order */
+       int                     nChildXids;             /* # of subcommitted child XIDs */
+       int                     maxChildXids;   /* allocated size of childXids[] */
+       Oid                     prevUser;               /* previous CurrentUserId setting */
+       bool            prevSecDefCxt;  /* previous SecurityDefinerContext setting */
        bool            prevXactReadOnly;               /* entry-time xact r/o state */
        struct TransactionStateData *parent;            /* back link to parent */
 } TransactionStateData;
 
 typedef TransactionStateData *TransactionState;
 
-/*
- * childXids is currently implemented as an Oid List, relying on the
- * assumption that TransactionIds are no wider than Oid.  We use these
- * macros to provide some isolation in case that changes in the future.
- */
-#define lfirst_xid(lc)                         ((TransactionId) lfirst_oid(lc))
-#define lappend_xid(list, datum)       lappend_oid(list, (Oid) (datum))
-
 /*
  * CurrentTransactionState always points to the current transaction state
  * block.  It will point to TopTransactionStateData when not in a
@@ -148,11 +155,15 @@ static TransactionStateData TopTransactionStateData = {
        TRANS_DEFAULT,                          /* transaction state */
        TBLOCK_DEFAULT,                         /* transaction block state from the client
                                                                 * perspective */
-       0,                                                      /* nesting level */
+       0,                                                      /* transaction nesting depth */
+       0,                                                      /* GUC context nesting depth */
        NULL,                                           /* cur transaction context */
        NULL,                                           /* cur transaction resource owner */
-       NIL,                                            /* subcommitted child Xids */
-       0,                                                      /* entry-time current userid */
+       NULL,                                           /* subcommitted child Xids */
+       0,                                                      /* # of subcommitted child Xids */
+       0,                                                      /* allocated size of childXids[] */
+       InvalidOid,                                     /* previous CurrentUserId setting */
+       false,                                          /* previous SecurityDefinerContext setting */
        false,                                          /* entry-time xact r/o state */
        NULL                                            /* link to parent state block */
 };
@@ -165,13 +176,18 @@ static TransactionState CurrentTransactionState = &TopTransactionStateData;
  */
 static SubTransactionId currentSubTransactionId;
 static CommandId currentCommandId;
+static bool currentCommandIdUsed;
 
 /*
- * This is the value of now(), ie, the transaction start time.
- * This does not change as we enter and exit subtransactions, so we don't
- * keep it inside the TransactionState stack.
+ * xactStartTimestamp is the value of transaction_timestamp().
+ * stmtStartTimestamp is the value of statement_timestamp().
+ * xactStopTimestamp is the time at which we log a commit or abort WAL record.
+ * These do not change as we enter and exit subtransactions, so we don't
+ * keep them inside the TransactionState stack.
  */
 static TimestampTz xactStartTimestamp;
+static TimestampTz stmtStartTimestamp;
+static TimestampTz xactStopTimestamp;
 
 /*
  * GID to be used for preparing the current transaction.  This is also
@@ -179,6 +195,17 @@ static TimestampTz xactStartTimestamp;
  */
 static char *prepareGID;
 
+/*
+ * Some commands want to force synchronous commit.
+ */
+static bool forceSyncCommit = false;
+
+/*
+ * Private context for transaction-abort work --- we reserve space for this
+ * at startup to ensure that AbortTransaction and AbortSubTransaction can work
+ * when we've run out of memory.
+ */
+static MemoryContext TransactionAbortContext = NULL;
 
 /*
  * List of add-on start- and end-of-xact callbacks
@@ -206,7 +233,7 @@ static SubXactCallbackItem *SubXact_callbacks = NULL;
 
 
 /* local function prototypes */
-static void AssignSubTransactionId(TransactionState s);
+static void AssignTransactionId(TransactionState s);
 static void AbortTransaction(void);
 static void AtAbort_Memory(void);
 static void AtCleanup_Memory(void);
@@ -222,7 +249,7 @@ static void CallSubXactCallbacks(SubXactEvent event,
                                         SubTransactionId parentSubid);
 static void CleanupTransaction(void);
 static void CommitTransaction(void);
-static void RecordTransactionAbort(void);
+static TransactionId RecordTransactionAbort(bool isSubXact);
 static void StartTransaction(void);
 
 static void RecordSubTransactionCommit(void);
@@ -254,34 +281,22 @@ static const char *TransStateAsString(TransState state);
 /*
  *     IsTransactionState
  *
- *     This returns true if we are currently running a query
- *     within an executing transaction.
+ *     This returns true if we are inside a valid transaction; that is,
+ *     it is safe to initiate database access, take heavyweight locks, etc.
  */
 bool
 IsTransactionState(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       switch (s->state)
-       {
-               case TRANS_DEFAULT:
-                       return false;
-               case TRANS_START:
-                       return true;
-               case TRANS_INPROGRESS:
-                       return true;
-               case TRANS_COMMIT:
-                       return true;
-               case TRANS_ABORT:
-                       return true;
-               case TRANS_PREPARE:
-                       return true;
-       }
-
        /*
-        * Shouldn't get here, but lint is not happy without this...
+        * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states.  However, we
+        * also reject the startup/shutdown states TRANS_START, TRANS_COMMIT,
+        * TRANS_PREPARE since it might be too soon or too late within those
+        * transition states to do anything interesting.  Hence, the only "valid"
+        * state is TRANS_INPROGRESS.
         */
-       return false;
+       return (s->state == TRANS_INPROGRESS);
 }
 
 /*
@@ -306,23 +321,36 @@ IsAbortedTransactionBlockState(void)
 /*
  *     GetTopTransactionId
  *
- * Get the ID of the main transaction, even if we are currently inside
- * a subtransaction.
+ * This will return the XID of the main transaction, assigning one if
+ * it's not yet set.  Be careful to call this only inside a valid xact.
  */
 TransactionId
 GetTopTransactionId(void)
 {
+       if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
+               AssignTransactionId(&TopTransactionStateData);
        return TopTransactionStateData.transactionId;
 }
 
+/*
+ *     GetTopTransactionIdIfAny
+ *
+ * This will return the XID of the main transaction, if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't yet been assigned an XID.
+ */
+TransactionId
+GetTopTransactionIdIfAny(void)
+{
+       return TopTransactionStateData.transactionId;
+}
 
 /*
  *     GetCurrentTransactionId
  *
- * We do not assign XIDs to subtransactions until/unless this is called.
- * When we do assign an XID to a subtransaction, recursively make sure
- * its parent has one as well (this maintains the invariant that a child
- * transaction has an XID following its parent's).
+ * This will return the XID of the current transaction (main or sub
+ * transaction), assigning one if it's not yet set.  Be careful to call this
+ * only inside a valid xact.
  */
 TransactionId
 GetCurrentTransactionId(void)
@@ -330,20 +358,49 @@ GetCurrentTransactionId(void)
        TransactionState s = CurrentTransactionState;
 
        if (!TransactionIdIsValid(s->transactionId))
-               AssignSubTransactionId(s);
-
+               AssignTransactionId(s);
        return s->transactionId;
 }
 
+/*
+ *     GetCurrentTransactionIdIfAny
+ *
+ * This will return the XID of the current (main or sub) xact, if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't been assigned an XID yet.
+ */
+TransactionId
+GetCurrentTransactionIdIfAny(void)
+{
+       return CurrentTransactionState->transactionId;
+}
+
+
+/*
+ * AssignTransactionId
+ *
+ * Assigns a new permanent XID to the given TransactionState.
+ * We do not assign XIDs to transactions until/unless this is called.
+ * Also, any parent TransactionStates that don't yet have XIDs are assigned
+ * one; this maintains the invariant that a child transaction has an XID
+ * following its parent's.
+ */
 static void
-AssignSubTransactionId(TransactionState s)
+AssignTransactionId(TransactionState s)
 {
+       bool            isSubXact = (s->parent != NULL);
        ResourceOwner currentOwner;
 
-       Assert(s->parent != NULL);
+       /* Assert that caller didn't screw up */
+       Assert(!TransactionIdIsValid(s->transactionId));
        Assert(s->state == TRANS_INPROGRESS);
-       if (!TransactionIdIsValid(s->parent->transactionId))
-               AssignSubTransactionId(s->parent);
+
+       /*
+        * Ensure parent(s) have XIDs, so that a child always has an XID later
+        * than its parent.
+        */
+       if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
+               AssignTransactionId(s->parent);
 
        /*
         * Generate a new Xid and record it in PG_PROC and pg_subtrans.
@@ -353,20 +410,20 @@ AssignSubTransactionId(TransactionState s)
         * PG_PROC, the subtrans entry is needed to ensure that other backends see
         * the Xid as "running".  See GetNewTransactionId.
         */
-       s->transactionId = GetNewTransactionId(true);
+       s->transactionId = GetNewTransactionId(isSubXact);
 
-       SubTransSetParent(s->transactionId, s->parent->transactionId);
+       if (isSubXact)
+               SubTransSetParent(s->transactionId, s->parent->transactionId);
 
        /*
         * Acquire lock on the transaction XID.  (We assume this cannot block.) We
-        * have to be sure that the lock is assigned to the transaction's
+        * have to ensure that the lock is assigned to the transaction's own
         * ResourceOwner.
         */
        currentOwner = CurrentResourceOwner;
        PG_TRY();
        {
                CurrentResourceOwner = s->curTransactionOwner;
-
                XactLockTableInsert(s->transactionId);
        }
        PG_CATCH();
@@ -380,22 +437,6 @@ AssignSubTransactionId(TransactionState s)
 }
 
 
-/*
- *     GetCurrentTransactionIdIfAny
- *
- * Unlike GetCurrentTransactionId, this will return InvalidTransactionId
- * if we are currently not in a transaction, or in a transaction or
- * subtransaction that has not yet assigned itself an XID.
- */
-TransactionId
-GetCurrentTransactionIdIfAny(void)
-{
-       TransactionState s = CurrentTransactionState;
-
-       return s->transactionId;
-}
-
-
 /*
  *     GetCurrentSubTransactionId
  */
@@ -410,11 +451,18 @@ GetCurrentSubTransactionId(void)
 
 /*
  *     GetCurrentCommandId
+ *
+ * "used" must be TRUE if the caller intends to use the command ID to mark
+ * inserted/updated/deleted tuples.  FALSE means the ID is being fetched
+ * for read-only purposes (ie, as a snapshot validity cutoff).  See
+ * CommandCounterIncrement() for discussion.
  */
 CommandId
-GetCurrentCommandId(void)
+GetCurrentCommandId(bool used)
 {
        /* this is global to a transaction, not subtransaction-local */
+       if (used)
+               currentCommandIdUsed = true;
        return currentCommandId;
 }
 
@@ -427,6 +475,47 @@ GetCurrentTransactionStartTimestamp(void)
        return xactStartTimestamp;
 }
 
+/*
+ *     GetCurrentStatementStartTimestamp
+ */
+TimestampTz
+GetCurrentStatementStartTimestamp(void)
+{
+       return stmtStartTimestamp;
+}
+
+/*
+ *     GetCurrentTransactionStopTimestamp
+ *
+ * We return current time if the transaction stop time hasn't been set
+ * (which can happen if we decide we don't need to log an XLOG record).
+ */
+TimestampTz
+GetCurrentTransactionStopTimestamp(void)
+{
+       if (xactStopTimestamp != 0)
+               return xactStopTimestamp;
+       return GetCurrentTimestamp();
+}
+
+/*
+ *     SetCurrentStatementStartTimestamp
+ */
+void
+SetCurrentStatementStartTimestamp(void)
+{
+       stmtStartTimestamp = GetCurrentTimestamp();
+}
+
+/*
+ *     SetCurrentTransactionStopTimestamp
+ */
+static inline void
+SetCurrentTransactionStopTimestamp(void)
+{
+       xactStopTimestamp = GetCurrentTimestamp();
+}
+
 /*
  *     GetCurrentTransactionNestLevel
  *
@@ -458,8 +547,12 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
         * is what we need during bootstrap.  (Bootstrap mode only inserts tuples,
         * it never updates or deletes them, so all tuples can be presumed good
         * immediately.)
+        *
+        * Likewise, InvalidTransactionId and FrozenTransactionId are certainly
+        * not my transaction ID, so we can just return "false" immediately for
+        * any non-normal XID.
         */
-       if (xid == BootstrapTransactionId)
+       if (!TransactionIdIsNormal(xid))
                return false;
 
        /*
@@ -471,7 +564,7 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
         */
        for (s = CurrentTransactionState; s != NULL; s = s->parent)
        {
-               ListCell   *cell;
+               int low, high;
 
                if (s->state == TRANS_ABORT)
                        continue;
@@ -479,10 +572,22 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
                        continue;                       /* it can't have any child XIDs either */
                if (TransactionIdEquals(xid, s->transactionId))
                        return true;
-               foreach(cell, s->childXids)
+               /* As the childXids array is ordered, we can use binary search */
+               low = 0;
+               high = s->nChildXids - 1;
+               while (low <= high)
                {
-                       if (TransactionIdEquals(xid, lfirst_xid(cell)))
+                       int                             middle;
+                       TransactionId   probe;
+
+                       middle = low + (high - low) / 2;
+                       probe = s->childXids[middle];
+                       if (TransactionIdEquals(probe, xid))
                                return true;
+                       else if (TransactionIdPrecedes(probe, xid))
+                               low = middle + 1;
+                       else
+                               high = middle - 1;
                }
        }
 
@@ -496,28 +601,65 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
 void
 CommandCounterIncrement(void)
 {
-       currentCommandId += 1;
-       if (currentCommandId == FirstCommandId)         /* check for overflow */
+       /*
+        * If the current value of the command counter hasn't been "used" to
+        * mark tuples, we need not increment it, since there's no need to
+        * distinguish a read-only command from others.  This helps postpone
+        * command counter overflow, and keeps no-op CommandCounterIncrement
+        * operations cheap.
+        */
+       if (currentCommandIdUsed)
        {
-               currentCommandId -= 1;
-               ereport(ERROR,
-                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+               currentCommandId += 1;
+               if (currentCommandId == FirstCommandId) /* check for overflow */
+               {
+                       currentCommandId -= 1;
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                  errmsg("cannot have more than 2^32-1 commands in a transaction")));
-       }
+               }
+               currentCommandIdUsed = false;
+
+               /* Propagate new command ID into static snapshots, if set */
+               if (SerializableSnapshot)
+                       SerializableSnapshot->curcid = currentCommandId;
+               if (LatestSnapshot)
+                       LatestSnapshot->curcid = currentCommandId;
 
-       /* Propagate new command ID into static snapshots, if set */
-       if (SerializableSnapshot)
-               SerializableSnapshot->curcid = currentCommandId;
-       if (LatestSnapshot)
-               LatestSnapshot->curcid = currentCommandId;
+               /*
+                * Make any catalog changes done by the just-completed command
+                * visible in the local syscache.  We obviously don't need to do
+                * this after a read-only command.  (But see hacks in inval.c
+                * to make real sure we don't think a command that queued inval
+                * messages was read-only.)
+                */
+               AtCommit_LocalCache();
+       }
 
        /*
-        * make cache changes visible to me.
+        * Make any other backends' catalog changes visible to me.
+        *
+        * XXX this is probably in the wrong place: CommandCounterIncrement
+        * should be purely a local operation, most likely.  However fooling
+        * with this will affect asynchronous cross-backend interactions,
+        * which doesn't seem like a wise thing to do in late beta, so save
+        * improving this for another day - tgl 2007-11-30
         */
-       AtCommit_LocalCache();
        AtStart_Cache();
 }
 
+/*
+ * ForceSyncCommit
+ *
+ * Interface routine to allow commands to force a synchronous commit of the
+ * current top-level transaction
+ */
+void
+ForceSyncCommit(void)
+{
+       forceSyncCommit = true;
+}
+
 
 /* ----------------------------------------------------------------
  *                                             StartTransaction stuff
@@ -541,6 +683,21 @@ AtStart_Memory(void)
 {
        TransactionState s = CurrentTransactionState;
 
+       /*
+        * If this is the first time through, create a private context for
+        * AbortTransaction to work in.  By reserving some space now, we can
+        * insulate AbortTransaction from out-of-memory scenarios.      Like
+        * ErrorContext, we set it up with slow growth rate and a nonzero minimum
+        * size, so that space will be reserved immediately.
+        */
+       if (TransactionAbortContext == NULL)
+               TransactionAbortContext =
+                       AllocSetContextCreate(TopMemoryContext,
+                                                                 "TransactionAbortContext",
+                                                                 32 * 1024,
+                                                                 32 * 1024,
+                                                                 32 * 1024);
+
        /*
         * We shouldn't have a transaction context already.
         */
@@ -650,162 +807,203 @@ AtSubStart_ResourceOwner(void)
 
 /*
  *     RecordTransactionCommit
+ *
+ * Returns latest XID among xact and its children, or InvalidTransactionId
+ * if the xact has no XID.     (We compute that here just because it's easier.)
+ *
+ * This is exported only to support an ugly hack in VACUUM FULL.
  */
-void
+TransactionId
 RecordTransactionCommit(void)
 {
+       TransactionId xid = GetTopTransactionIdIfAny();
+       bool            markXidCommitted = TransactionIdIsValid(xid);
+       TransactionId latestXid = InvalidTransactionId;
        int                     nrels;
        RelFileNode *rels;
+       bool            haveNonTemp;
        int                     nchildren;
        TransactionId *children;
 
        /* Get data needed for commit record */
-       nrels = smgrGetPendingDeletes(true, &rels);
+       nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
        nchildren = xactGetCommittedChildren(&children);
 
        /*
-        * If we made neither any XLOG entries nor any temp-rel updates, and have
-        * no files to be deleted, we can omit recording the transaction commit at
-        * all.  (This test includes the effects of subtransactions, so the
-        * presence of committed subxacts need not alone force a write.)
+        * If we haven't been assigned an XID yet, we neither can nor want to
+        * write a COMMIT record.
         */
-       if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate || nrels > 0)
+       if (!markXidCommitted)
        {
-               TransactionId xid = GetCurrentTransactionId();
-               bool            madeTCentries;
-               XLogRecPtr      recptr;
+               /*
+                * We expect that every smgrscheduleunlink is followed by a catalog
+                * update, and hence XID assignment, so we shouldn't get here with any
+                * pending deletes.  Use a real test not just an Assert to check this,
+                * since it's a bit fragile.
+                */
+               if (nrels != 0)
+                       elog(ERROR, "cannot commit a transaction that deleted files but has no xid");
+
+               /* Can't have child XIDs either; AssignTransactionId enforces this */
+               Assert(nchildren == 0);
+
+               /*
+                * If we didn't create XLOG entries, we're done here; otherwise we
+                * should flush those entries the same as a commit record.      (An
+                * example of a possible record that wouldn't cause an XID to be
+                * assigned is a sequence advance record due to nextval() --- we want
+                * to flush that to disk before reporting commit.)
+                */
+               if (XactLastRecEnd.xrecoff == 0)
+                       goto cleanup;
+       }
+       else
+       {
+               /*
+                * Begin commit critical section and insert the commit XLOG record.
+                */
+               XLogRecData rdata[3];
+               int                     lastrdata = 0;
+               xl_xact_commit xlrec;
 
                /* Tell bufmgr and smgr to prepare for commit */
                BufmgrCommit();
 
-               START_CRIT_SECTION();
-
                /*
-                * If our transaction made any transaction-controlled XLOG entries, we
-                * need to lock out checkpoint start between writing our XLOG record
-                * and updating pg_clog.  Otherwise it is possible for the checkpoint
-                * to set REDO after the XLOG record but fail to flush the pg_clog
-                * update to disk, leading to loss of the transaction commit if we
-                * crash a little later.  Slightly klugy fix for problem discovered
-                * 2004-08-10.
+                * Mark ourselves as within our "commit critical section".      This
+                * forces any concurrent checkpoint to wait until we've updated
+                * pg_clog.  Without this, it is possible for the checkpoint to set
+                * REDO after the XLOG record but fail to flush the pg_clog update to
+                * disk, leading to loss of the transaction commit if the system
+                * crashes a little later.
                 *
-                * (If it made no transaction-controlled XLOG entries, its XID appears
-                * nowhere in permanent storage, so no one else will ever care if it
-                * committed; so it doesn't matter if we lose the commit flag.)
+                * Note: we could, but don't bother to, set this flag in
+                * RecordTransactionAbort.      That's because loss of a transaction abort
+                * is noncritical; the presumption would be that it aborted, anyway.
                 *
-                * Note we only need a shared lock.
-                */
-               madeTCentries = (MyLastRecPtr.xrecoff != 0);
-               if (madeTCentries)
-                       LWLockAcquire(CheckpointStartLock, LW_SHARED);
-
-               /*
-                * We only need to log the commit in XLOG if the transaction made any
-                * transaction-controlled XLOG entries or will delete files.
+                * It's safe to change the inCommit flag of our own backend without
+                * holding the ProcArrayLock, since we're the only one modifying it.
+                * This makes checkpoint's determination of which xacts are inCommit a
+                * bit fuzzy, but it doesn't matter.
                 */
-               if (madeTCentries || nrels > 0)
+               START_CRIT_SECTION();
+               MyProc->inCommit = true;
+
+               SetCurrentTransactionStopTimestamp();
+               xlrec.xact_time = xactStopTimestamp;
+               xlrec.nrels = nrels;
+               xlrec.nsubxacts = nchildren;
+               rdata[0].data = (char *) (&xlrec);
+               rdata[0].len = MinSizeOfXactCommit;
+               rdata[0].buffer = InvalidBuffer;
+               /* dump rels to delete */
+               if (nrels > 0)
                {
-                       XLogRecData rdata[3];
-                       int                     lastrdata = 0;
-                       xl_xact_commit xlrec;
-
-                       xlrec.xtime = time(NULL);
-                       xlrec.nrels = nrels;
-                       xlrec.nsubxacts = nchildren;
-                       rdata[0].data = (char *) (&xlrec);
-                       rdata[0].len = MinSizeOfXactCommit;
-                       rdata[0].buffer = InvalidBuffer;
-                       /* dump rels to delete */
-                       if (nrels > 0)
-                       {
-                               rdata[0].next = &(rdata[1]);
-                               rdata[1].data = (char *) rels;
-                               rdata[1].len = nrels * sizeof(RelFileNode);
-                               rdata[1].buffer = InvalidBuffer;
-                               lastrdata = 1;
-                       }
-                       /* dump committed child Xids */
-                       if (nchildren > 0)
-                       {
-                               rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].data = (char *) children;
-                               rdata[2].len = nchildren * sizeof(TransactionId);
-                               rdata[2].buffer = InvalidBuffer;
-                               lastrdata = 2;
-                       }
-                       rdata[lastrdata].next = NULL;
-
-                       recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
+                       rdata[0].next = &(rdata[1]);
+                       rdata[1].data = (char *) rels;
+                       rdata[1].len = nrels * sizeof(RelFileNode);
+                       rdata[1].buffer = InvalidBuffer;
+                       lastrdata = 1;
                }
-               else
+               /* dump committed child Xids */
+               if (nchildren > 0)
                {
-                       /* Just flush through last record written by me */
-                       recptr = ProcLastRecEnd;
+                       rdata[lastrdata].next = &(rdata[2]);
+                       rdata[2].data = (char *) children;
+                       rdata[2].len = nchildren * sizeof(TransactionId);
+                       rdata[2].buffer = InvalidBuffer;
+                       lastrdata = 2;
                }
+               rdata[lastrdata].next = NULL;
 
+               (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
+       }
+
+       /*
+        * Check if we want to commit asynchronously.  If the user has set
+        * synchronous_commit = off, and we're not doing cleanup of any non-temp
+        * rels nor committing any command that wanted to force sync commit, then
+        * we can defer flushing XLOG.  (We must not allow asynchronous commit if
+        * there are any non-temp tables to be deleted, because we might delete
+        * the files before the COMMIT record is flushed to disk.  We do allow
+        * asynchronous commit if all to-be-deleted tables are temporary though,
+        * since they are lost anyway if we crash.)
+        */
+       if (XactSyncCommit || forceSyncCommit || haveNonTemp)
+       {
                /*
-                * We must flush our XLOG entries to disk if we made any XLOG entries,
-                * whether in or out of transaction control.  For example, if we
-                * reported a nextval() result to the client, this ensures that any
-                * XLOG record generated by nextval will hit the disk before we report
-                * the transaction committed.
+                * Synchronous commit case.
+                *
+                * Sleep before flush! So we can flush more than one commit record
+                * per single fsync.  (The idea is some other backend may do the
+                * XLogFlush while we're sleeping.  This needs work still, because on
+                * most Unixen, the minimum select() delay is 10msec or more, which is
+                * way too long.)
                 *
-                * Note: if we generated a commit record above, MyXactMadeXLogEntry
-                * will certainly be set now.
+                * We do not sleep if enableFsync is not turned on, nor if there are
+                * fewer than CommitSiblings other backends with active transactions.
                 */
-               if (MyXactMadeXLogEntry)
-               {
-                       /*
-                        * Sleep before flush! So we can flush more than one commit
-                        * records per single fsync.  (The idea is some other backend may
-                        * do the XLogFlush while we're sleeping.  This needs work still,
-                        * because on most Unixen, the minimum select() delay is 10msec or
-                        * more, which is way too long.)
-                        *
-                        * We do not sleep if enableFsync is not turned on, nor if there
-                        * are fewer than CommitSiblings other backends with active
-                        * transactions.
-                        */
-                       if (CommitDelay > 0 && enableFsync &&
-                               CountActiveBackends() >= CommitSiblings)
-                               pg_usleep(CommitDelay);
+               if (CommitDelay > 0 && enableFsync &&
+                       CountActiveBackends() >= CommitSiblings)
+                       pg_usleep(CommitDelay);
 
-                       XLogFlush(recptr);
-               }
+               XLogFlush(XactLastRecEnd);
 
                /*
-                * We must mark the transaction committed in clog if its XID appears
-                * either in permanent rels or in local temporary rels. We test this
-                * by seeing if we made transaction-controlled entries *OR* local-rel
-                * tuple updates.  Note that if we made only the latter, we have not
-                * emitted an XLOG record for our commit, and so in the event of a
-                * crash the clog update might be lost.  This is okay because no one
-                * else will ever care whether we committed.
+                * Now we may update the CLOG, if we wrote a COMMIT record above
                 */
-               if (madeTCentries || MyXactMadeTempRelUpdate)
+               if (markXidCommitted)
                {
                        TransactionIdCommit(xid);
                        /* to avoid race conditions, the parent must commit first */
                        TransactionIdCommitTree(nchildren, children);
                }
+       }
+       else
+       {
+               /*
+                * Asynchronous commit case.
+                *
+                * Report the latest async commit LSN, so that the WAL writer knows to
+                * flush this commit.
+                */
+               XLogSetAsyncCommitLSN(XactLastRecEnd);
 
-               /* Unlock checkpoint lock if we acquired it */
-               if (madeTCentries)
-                       LWLockRelease(CheckpointStartLock);
+               /*
+                * We must not immediately update the CLOG, since we didn't flush the
+                * XLOG. Instead, we store the LSN up to which the XLOG must be
+                * flushed before the CLOG may be updated.
+                */
+               if (markXidCommitted)
+               {
+                       TransactionIdAsyncCommit(xid, XactLastRecEnd);
+                       /* to avoid race conditions, the parent must commit first */
+                       TransactionIdAsyncCommitTree(nchildren, children, XactLastRecEnd);
+               }
+       }
 
+       /*
+        * If we entered a commit critical section, leave it now, and let
+        * checkpoints proceed.
+        */
+       if (markXidCommitted)
+       {
+               MyProc->inCommit = false;
                END_CRIT_SECTION();
        }
 
-       /* Break the chain of back-links in the XLOG records I output */
-       MyLastRecPtr.xrecoff = 0;
-       MyXactMadeXLogEntry = false;
-       MyXactMadeTempRelUpdate = false;
+       /* Compute latestXid while we have the child XIDs handy */
+       latestXid = TransactionIdLatest(xid, nchildren, children);
 
-       /* And clean up local data */
+       /* Reset XactLastRecEnd until the next transaction writes something */
+       XactLastRecEnd.xrecoff = 0;
+
+cleanup:
+       /* Clean up local data */
        if (rels)
                pfree(rels);
-       if (children)
-               pfree(children);
+
+       return latestXid;
 }
 
 
@@ -884,34 +1082,79 @@ static void
 AtSubCommit_childXids(void)
 {
        TransactionState s = CurrentTransactionState;
-       MemoryContext old_cxt;
+       int                     new_nChildXids;
 
        Assert(s->parent != NULL);
 
        /*
-        * We keep the child-XID lists in TopTransactionContext; this avoids
-        * setting up child-transaction contexts for what might be just a few
-        * bytes of grandchild XIDs.
+        * The parent childXids array will need to hold my XID and all my
+        * childXids, in addition to the XIDs already there.
         */
-       old_cxt = MemoryContextSwitchTo(TopTransactionContext);
+       new_nChildXids = s->parent->nChildXids + s->nChildXids + 1;
 
-       s->parent->childXids = lappend_xid(s->parent->childXids,
-                                                                          s->transactionId);
-
-       if (s->childXids != NIL)
+       /* Allocate or enlarge the parent array if necessary */
+       if (s->parent->maxChildXids < new_nChildXids)
        {
-               s->parent->childXids = list_concat(s->parent->childXids,
-                                                                                  s->childXids);
+               int                             new_maxChildXids;
+               TransactionId  *new_childXids;
 
                /*
-                * list_concat doesn't free the list header for the second list; do so
-                * here to avoid memory leakage (kluge)
+                * Make it 2x what's needed right now, to avoid having to enlarge it
+                * repeatedly. But we can't go above MaxAllocSize.  (The latter
+                * limit is what ensures that we don't need to worry about integer
+                * overflow here or in the calculation of new_nChildXids.)
                 */
-               pfree(s->childXids);
-               s->childXids = NIL;
+               new_maxChildXids = Min(new_nChildXids * 2,
+                                                          (int) (MaxAllocSize / sizeof(TransactionId)));
+
+               if (new_maxChildXids < new_nChildXids)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                        errmsg("maximum number of committed subtransactions (%d) exceeded",
+                                                       (int) (MaxAllocSize / sizeof(TransactionId)))));
+
+               /*
+                * We keep the child-XID arrays in TopTransactionContext; this avoids
+                * setting up child-transaction contexts for what might be just a few
+                * bytes of grandchild XIDs.
+                */
+               if (s->parent->childXids == NULL)
+                       new_childXids =
+                               MemoryContextAlloc(TopTransactionContext, 
+                                                                  new_maxChildXids * sizeof(TransactionId));
+               else
+                       new_childXids = repalloc(s->parent->childXids, 
+                                                                        new_maxChildXids * sizeof(TransactionId));
+
+               s->parent->childXids  = new_childXids;
+               s->parent->maxChildXids = new_maxChildXids;
        }
 
-       MemoryContextSwitchTo(old_cxt);
+       /*
+        * Copy all my XIDs to parent's array.
+        *
+        * Note: We rely on the fact that the XID of a child always follows that
+        * of its parent.  By copying the XID of this subtransaction before the
+        * XIDs of its children, we ensure that the array stays ordered.  Likewise,
+        * all XIDs already in the array belong to subtransactions started and
+        * subcommitted before us, so their XIDs must precede ours.
+        */
+       s->parent->childXids[s->parent->nChildXids] = s->transactionId;
+
+       if (s->nChildXids > 0)
+               memcpy(&s->parent->childXids[s->parent->nChildXids + 1],
+                          s->childXids,
+                          s->nChildXids * sizeof(TransactionId));
+
+       s->parent->nChildXids = new_nChildXids;
+
+       /* Release child's array to avoid leakage */
+       if (s->childXids != NULL)
+               pfree(s->childXids);
+       /* We must reset these to avoid double-free if we fail later in commit */
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
 }
 
 /*
@@ -920,23 +1163,20 @@ AtSubCommit_childXids(void)
 static void
 RecordSubTransactionCommit(void)
 {
+       TransactionId xid = GetCurrentTransactionIdIfAny();
+
        /*
         * We do not log the subcommit in XLOG; it doesn't matter until the
         * top-level transaction commits.
         *
-        * We must mark the subtransaction subcommitted in clog if its XID appears
-        * either in permanent rels or in local temporary rels. We test this by
-        * seeing if we made transaction-controlled entries *OR* local-rel tuple
-        * updates.  (The test here actually covers the entire transaction tree so
-        * far, so it may mark subtransactions that don't really need it, but it's
-        * probably not worth being tenser. Note that if a prior subtransaction
-        * dirtied these variables, then RecordTransactionCommit will have to do
-        * the full pushup anyway...)
+        * We must mark the subtransaction subcommitted in the CLOG if it had a
+        * valid XID assigned.  If it did not, nobody else will ever know about
+        * the existence of this subxact.  We don't have to deal with deletions
+        * scheduled for on-commit here, since they'll be reassigned to our parent
+        * (who might still abort).
         */
-       if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
+       if (TransactionIdIsValid(xid))
        {
-               TransactionId xid = GetCurrentTransactionId();
-
                /* XXX does this really need to be a critical section? */
                START_CRIT_SECTION();
 
@@ -954,115 +1194,133 @@ RecordSubTransactionCommit(void)
 
 /*
  *     RecordTransactionAbort
+ *
+ * Returns latest XID among xact and its children, or InvalidTransactionId
+ * if the xact has no XID.     (We compute that here just because it's easier.)
  */
-static void
-RecordTransactionAbort(void)
+static TransactionId
+RecordTransactionAbort(bool isSubXact)
 {
+       TransactionId xid = GetCurrentTransactionIdIfAny();
+       TransactionId latestXid;
        int                     nrels;
        RelFileNode *rels;
        int                     nchildren;
        TransactionId *children;
-
-       /* Get data needed for abort record */
-       nrels = smgrGetPendingDeletes(false, &rels);
-       nchildren = xactGetCommittedChildren(&children);
+       XLogRecData rdata[3];
+       int                     lastrdata = 0;
+       xl_xact_abort xlrec;
 
        /*
-        * If we made neither any transaction-controlled XLOG entries nor any
-        * temp-rel updates, and are not going to delete any files, we can omit
-        * recording the transaction abort at all.      No one will ever care that it
-        * aborted.  (These tests cover our whole transaction tree.)
+        * If we haven't been assigned an XID, nobody will care whether we aborted
+        * or not.      Hence, we're done in that case.  It does not matter if we have
+        * rels to delete (note that this routine is not responsible for actually
+        * deleting 'em).  We cannot have any child XIDs, either.
         */
-       if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
+       if (!TransactionIdIsValid(xid))
        {
-               TransactionId xid = GetCurrentTransactionId();
+               /* Reset XactLastRecEnd until the next transaction writes something */
+               if (!isSubXact)
+                       XactLastRecEnd.xrecoff = 0;
+               return InvalidTransactionId;
+       }
 
-               /*
-                * Catch the scenario where we aborted partway through
-                * RecordTransactionCommit ...
-                */
-               if (TransactionIdDidCommit(xid))
-                       elog(PANIC, "cannot abort transaction %u, it was already committed", xid);
+       /*
+        * We have a valid XID, so we should write an ABORT record for it.
+        *
+        * We do not flush XLOG to disk here, since the default assumption after a
+        * crash would be that we aborted, anyway.      For the same reason, we don't
+        * need to worry about interlocking against checkpoint start.
+        */
 
-               START_CRIT_SECTION();
+       /*
+        * Check that we haven't aborted halfway through RecordTransactionCommit.
+        */
+       if (TransactionIdDidCommit(xid))
+               elog(PANIC, "cannot abort transaction %u, it was already committed",
+                        xid);
 
-               /*
-                * We only need to log the abort in XLOG if the transaction made any
-                * transaction-controlled XLOG entries or will delete files. (If it
-                * made no transaction-controlled XLOG entries, its XID appears
-                * nowhere in permanent storage, so no one else will ever care if it
-                * committed.)
-                *
-                * We do not flush XLOG to disk unless deleting files, since the
-                * default assumption after a crash would be that we aborted, anyway.
-                * For the same reason, we don't need to worry about interlocking
-                * against checkpoint start.
-                */
-               if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
-               {
-                       XLogRecData rdata[3];
-                       int                     lastrdata = 0;
-                       xl_xact_abort xlrec;
-                       XLogRecPtr      recptr;
-
-                       xlrec.xtime = time(NULL);
-                       xlrec.nrels = nrels;
-                       xlrec.nsubxacts = nchildren;
-                       rdata[0].data = (char *) (&xlrec);
-                       rdata[0].len = MinSizeOfXactAbort;
-                       rdata[0].buffer = InvalidBuffer;
-                       /* dump rels to delete */
-                       if (nrels > 0)
-                       {
-                               rdata[0].next = &(rdata[1]);
-                               rdata[1].data = (char *) rels;
-                               rdata[1].len = nrels * sizeof(RelFileNode);
-                               rdata[1].buffer = InvalidBuffer;
-                               lastrdata = 1;
-                       }
-                       /* dump committed child Xids */
-                       if (nchildren > 0)
-                       {
-                               rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].data = (char *) children;
-                               rdata[2].len = nchildren * sizeof(TransactionId);
-                               rdata[2].buffer = InvalidBuffer;
-                               lastrdata = 2;
-                       }
-                       rdata[lastrdata].next = NULL;
+       /* Fetch the data we need for the abort record */
+       nrels = smgrGetPendingDeletes(false, &rels, NULL);
+       nchildren = xactGetCommittedChildren(&children);
 
-                       recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
+       /* XXX do we really need a critical section here? */
+       START_CRIT_SECTION();
 
-                       /* Must flush if we are deleting files... */
-                       if (nrels > 0)
-                               XLogFlush(recptr);
-               }
+       /* Write the ABORT record */
+       if (isSubXact)
+               xlrec.xact_time = GetCurrentTimestamp();
+       else
+       {
+               SetCurrentTransactionStopTimestamp();
+               xlrec.xact_time = xactStopTimestamp;
+       }
+       xlrec.nrels = nrels;
+       xlrec.nsubxacts = nchildren;
+       rdata[0].data = (char *) (&xlrec);
+       rdata[0].len = MinSizeOfXactAbort;
+       rdata[0].buffer = InvalidBuffer;
+       /* dump rels to delete */
+       if (nrels > 0)
+       {
+               rdata[0].next = &(rdata[1]);
+               rdata[1].data = (char *) rels;
+               rdata[1].len = nrels * sizeof(RelFileNode);
+               rdata[1].buffer = InvalidBuffer;
+               lastrdata = 1;
+       }
+       /* dump committed child Xids */
+       if (nchildren > 0)
+       {
+               rdata[lastrdata].next = &(rdata[2]);
+               rdata[2].data = (char *) children;
+               rdata[2].len = nchildren * sizeof(TransactionId);
+               rdata[2].buffer = InvalidBuffer;
+               lastrdata = 2;
+       }
+       rdata[lastrdata].next = NULL;
 
-               /*
-                * Mark the transaction aborted in clog.  This is not absolutely
-                * necessary but we may as well do it while we are here.
-                *
-                * The ordering here isn't critical but it seems best to mark the
-                * parent first.  This assures an atomic transition of all the
-                * subtransactions to aborted state from the point of view of
-                * concurrent TransactionIdDidAbort calls.
-                */
-               TransactionIdAbort(xid);
-               TransactionIdAbortTree(nchildren, children);
+       (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
 
-               END_CRIT_SECTION();
-       }
+       /*
+        * Mark the transaction aborted in clog.  This is not absolutely necessary
+        * but we may as well do it while we are here; also, in the subxact case
+        * it is helpful because XactLockTableWait makes use of it to avoid
+        * waiting for already-aborted subtransactions.  It is OK to do it without
+        * having flushed the ABORT record to disk, because in event of a crash
+        * we'd be assumed to have aborted anyway.
+        *
+        * The ordering here isn't critical but it seems best to mark the parent
+        * first.  This assures an atomic transition of all the subtransactions to
+        * aborted state from the point of view of concurrent
+        * TransactionIdDidAbort calls.
+        */
+       TransactionIdAbort(xid);
+       TransactionIdAbortTree(nchildren, children);
+
+       END_CRIT_SECTION();
 
-       /* Break the chain of back-links in the XLOG records I output */
-       MyLastRecPtr.xrecoff = 0;
-       MyXactMadeXLogEntry = false;
-       MyXactMadeTempRelUpdate = false;
+       /* Compute latestXid while we have the child XIDs handy */
+       latestXid = TransactionIdLatest(xid, nchildren, children);
+
+       /*
+        * If we're aborting a subtransaction, we can immediately remove failed
+        * XIDs from PGPROC's cache of running child XIDs.  We do that here for
+        * subxacts, because we already have the child XID array at hand.  For
+        * main xacts, the equivalent happens just after this function returns.
+        */
+       if (isSubXact)
+               XidCacheRemoveRunningXids(xid, nchildren, children, latestXid);
+
+       /* Reset XactLastRecEnd until the next transaction writes something */
+       if (!isSubXact)
+               XactLastRecEnd.xrecoff = 0;
 
        /* And clean up local data */
        if (rels)
                pfree(rels);
-       if (children)
-               pfree(children);
+
+       return latestXid;
 }
 
 /*
@@ -1072,20 +1330,15 @@ static void
 AtAbort_Memory(void)
 {
        /*
-        * Make sure we are in a valid context (not a child of
-        * TopTransactionContext...).  Note that it is possible for this code to
-        * be called when we aren't in a transaction at all; go directly to
-        * TopMemoryContext in that case.
+        * Switch into TransactionAbortContext, which should have some free space
+        * even if nothing else does.  We'll work in this context until we've
+        * finished cleaning up.
+        *
+        * It is barely possible to get here when we've not been able to create
+        * TransactionAbortContext yet; if so use TopMemoryContext.
         */
-       if (TopTransactionContext != NULL)
-       {
-               MemoryContextSwitchTo(TopTransactionContext);
-
-               /*
-                * We do not want to destroy the transaction's global state yet, so we
-                * can't free any memory here.
-                */
-       }
+       if (TransactionAbortContext != NULL)
+               MemoryContextSwitchTo(TransactionAbortContext);
        else
                MemoryContextSwitchTo(TopMemoryContext);
 }
@@ -1096,9 +1349,9 @@ AtAbort_Memory(void)
 static void
 AtSubAbort_Memory(void)
 {
-       Assert(TopTransactionContext != NULL);
+       Assert(TransactionAbortContext != NULL);
 
-       MemoryContextSwitchTo(TopTransactionContext);
+       MemoryContextSwitchTo(TransactionAbortContext);
 }
 
 
@@ -1137,114 +1390,15 @@ AtSubAbort_childXids(void)
        TransactionState s = CurrentTransactionState;
 
        /*
-        * We keep the child-XID lists in TopTransactionContext (see
-        * AtSubCommit_childXids).      This means we'd better free the list
+        * We keep the child-XID arrays in TopTransactionContext (see
+        * AtSubCommit_childXids).      This means we'd better free the array
         * explicitly at abort to avoid leakage.
         */
-       list_free(s->childXids);
-       s->childXids = NIL;
-}
-
-/*
- * RecordSubTransactionAbort
- */
-static void
-RecordSubTransactionAbort(void)
-{
-       int                     nrels;
-       RelFileNode *rels;
-       TransactionId xid = GetCurrentTransactionId();
-       int                     nchildren;
-       TransactionId *children;
-
-       /* Get data needed for abort record */
-       nrels = smgrGetPendingDeletes(false, &rels);
-       nchildren = xactGetCommittedChildren(&children);
-
-       /*
-        * If we made neither any transaction-controlled XLOG entries nor any
-        * temp-rel updates, and are not going to delete any files, we can omit
-        * recording the transaction abort at all.      No one will ever care that it
-        * aborted.  (These tests cover our whole transaction tree, and therefore
-        * may mark subxacts that don't really need it, but it's probably not
-        * worth being tenser.)
-        *
-        * In this case we needn't worry about marking subcommitted children as
-        * aborted, because they didn't mark themselves as subcommitted in the
-        * first place; see the optimization in RecordSubTransactionCommit.
-        */
-       if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
-       {
-               START_CRIT_SECTION();
-
-               /*
-                * We only need to log the abort in XLOG if the transaction made any
-                * transaction-controlled XLOG entries or will delete files.
-                */
-               if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
-               {
-                       XLogRecData rdata[3];
-                       int                     lastrdata = 0;
-                       xl_xact_abort xlrec;
-                       XLogRecPtr      recptr;
-
-                       xlrec.xtime = time(NULL);
-                       xlrec.nrels = nrels;
-                       xlrec.nsubxacts = nchildren;
-                       rdata[0].data = (char *) (&xlrec);
-                       rdata[0].len = MinSizeOfXactAbort;
-                       rdata[0].buffer = InvalidBuffer;
-                       /* dump rels to delete */
-                       if (nrels > 0)
-                       {
-                               rdata[0].next = &(rdata[1]);
-                               rdata[1].data = (char *) rels;
-                               rdata[1].len = nrels * sizeof(RelFileNode);
-                               rdata[1].buffer = InvalidBuffer;
-                               lastrdata = 1;
-                       }
-                       /* dump committed child Xids */
-                       if (nchildren > 0)
-                       {
-                               rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].data = (char *) children;
-                               rdata[2].len = nchildren * sizeof(TransactionId);
-                               rdata[2].buffer = InvalidBuffer;
-                               lastrdata = 2;
-                       }
-                       rdata[lastrdata].next = NULL;
-
-                       recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
-
-                       /* Must flush if we are deleting files... */
-                       if (nrels > 0)
-                               XLogFlush(recptr);
-               }
-
-               /*
-                * Mark the transaction aborted in clog.  This is not absolutely
-                * necessary but XactLockTableWait makes use of it to avoid waiting
-                * for already-aborted subtransactions.
-                */
-               TransactionIdAbort(xid);
-               TransactionIdAbortTree(nchildren, children);
-
-               END_CRIT_SECTION();
-       }
-
-       /*
-        * We can immediately remove failed XIDs from PGPROC's cache of running
-        * child XIDs. It's easiest to do it here while we have the child XID
-        * array at hand, even though in the main-transaction case the equivalent
-        * work happens just after return from RecordTransactionAbort.
-        */
-       XidCacheRemoveRunningXids(xid, nchildren, children);
-
-       /* And clean up local data */
-       if (rels)
-               pfree(rels);
-       if (children)
-               pfree(children);
+       if (s->childXids != NULL)
+               pfree(s->childXids);
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
 }
 
 /* ----------------------------------------------------------------
@@ -1258,13 +1412,19 @@ RecordSubTransactionAbort(void)
 static void
 AtCleanup_Memory(void)
 {
+       Assert(CurrentTransactionState->parent == NULL);
+
        /*
         * Now that we're "out" of a transaction, have the system allocate things
         * in the top memory context instead of per-transaction contexts.
         */
        MemoryContextSwitchTo(TopMemoryContext);
 
-       Assert(CurrentTransactionState->parent == NULL);
+       /*
+        * Clear the special abort context for next time.
+        */
+       if (TransactionAbortContext != NULL)
+               MemoryContextResetAndDeleteChildren(TransactionAbortContext);
 
        /*
         * Release all transaction-local memory.
@@ -1296,6 +1456,12 @@ AtSubCleanup_Memory(void)
        MemoryContextSwitchTo(s->parent->curTransactionContext);
        CurTransactionContext = s->parent->curTransactionContext;
 
+       /*
+        * Clear the special abort context for next time.
+        */
+       if (TransactionAbortContext != NULL)
+               MemoryContextResetAndDeleteChildren(TransactionAbortContext);
+
        /*
         * Delete the subxact local memory contexts. Its CurTransactionContext can
         * go too (note this also kills CurTransactionContexts from any children
@@ -1318,6 +1484,7 @@ static void
 StartTransaction(void)
 {
        TransactionState s;
+       VirtualTransactionId vxid;
 
        /*
         * Let's just make sure the state stack is empty
@@ -1345,6 +1512,8 @@ StartTransaction(void)
        FreeXactSnapshot();
        XactIsoLevel = DefaultXactIsoLevel;
        XactReadOnly = DefaultXactReadOnly;
+       forceSyncCommit = false;
+       MyXactAccessedTempRel = false;
 
        /*
         * reinitialize within-transaction counters
@@ -1352,6 +1521,7 @@ StartTransaction(void)
        s->subTransactionId = TopSubTransactionId;
        currentSubTransactionId = TopSubTransactionId;
        currentCommandId = FirstCommandId;
+       currentCommandIdUsed = false;
 
        /*
         * must initialize resource-management stuff first
@@ -1360,35 +1530,54 @@ StartTransaction(void)
        AtStart_ResourceOwner();
 
        /*
-        * generate a new transaction id
+        * Assign a new LocalTransactionId, and combine it with the backendId to
+        * form a virtual transaction id.
         */
-       s->transactionId = GetNewTransactionId(false);
+       vxid.backendId = MyBackendId;
+       vxid.localTransactionId = GetNextLocalTransactionId();
 
-       XactLockTableInsert(s->transactionId);
+       /*
+        * Lock the virtual transaction id before we announce it in the proc array
+        */
+       VirtualXactLockTableInsert(vxid);
 
        /*
-        * set now()
+        * Advertise it in the proc array.      We assume assignment of
+        * LocalTransactionId is atomic, and the backendId should be set already.
         */
-       xactStartTimestamp = GetCurrentTimestamp();
+       Assert(MyProc->backendId == vxid.backendId);
+       MyProc->lxid = vxid.localTransactionId;
+
+       TRACE_POSTGRESQL_TRANSACTION_START(vxid.localTransactionId);
 
        /*
-        * initialize current transaction state fields
+        * set transaction_timestamp() (a/k/a now()).  We want this to be the same
+        * as the first command's statement_timestamp(), so don't do a fresh
+        * GetCurrentTimestamp() call (which'd be expensive anyway).  Also, mark
+        * xactStopTimestamp as unset.
         */
-       s->nestingLevel = 1;
-       s->childXids = NIL;
+       xactStartTimestamp = stmtStartTimestamp;
+       xactStopTimestamp = 0;
+       pgstat_report_xact_timestamp(xactStartTimestamp);
 
        /*
-        * You might expect to see "s->currentUser = GetUserId();" here, but you
-        * won't because it doesn't work during startup; the userid isn't set yet
-        * during a backend's first transaction start.  We only use the
-        * currentUser field in sub-transaction state structs.
+        * initialize current transaction state fields
         *
-        * prevXactReadOnly is also valid only in sub-transactions.
+        * note: prevXactReadOnly is not used at the outermost level
         */
+       s->nestingLevel = 1;
+       s->gucNestLevel = 1;
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
+       GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
+       /* SecurityDefinerContext should never be set outside a transaction */
+       Assert(!s->prevSecDefCxt);
 
        /*
         * initialize other subsystems for new transaction
         */
+       AtStart_GUC();
        AtStart_Inval();
        AtStart_Cache();
        AfterTriggerBeginXact();
@@ -1412,6 +1601,7 @@ static void
 CommitTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
+       TransactionId latestXid;
 
        ShowTransactionState("CommitTransaction");
 
@@ -1483,37 +1673,16 @@ CommitTransaction(void)
        /*
         * Here is where we really truly commit.
         */
-       RecordTransactionCommit();
+       latestXid = RecordTransactionCommit();
 
-       /*----------
-        * Let others know about no transaction in progress by me. Note that
-        * this must be done _before_ releasing locks we hold and _after_
+       TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
+
+       /*
+        * Let others know about no transaction in progress by me. Note that this
+        * must be done _before_ releasing locks we hold and _after_
         * RecordTransactionCommit.
-        *
-        * LWLockAcquire(ProcArrayLock) is required; consider this example:
-        *              UPDATE with xid 0 is blocked by xid 1's UPDATE.
-        *              xid 1 is doing commit while xid 2 gets snapshot.
-        * If xid 2's GetSnapshotData sees xid 1 as running then it must see
-        * xid 0 as running as well, or it will be able to see two tuple versions
-        * - one deleted by xid 1 and one inserted by xid 0.  See notes in
-        * GetSnapshotData.
-        *
-        * Note: MyProc may be null during bootstrap.
-        *----------
         */
-       if (MyProc != NULL)
-       {
-               /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
-               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-               MyProc->xid = InvalidTransactionId;
-               MyProc->xmin = InvalidTransactionId;
-
-               /* Clear the subtransaction-XID cache too while holding the lock */
-               MyProc->subxids.nxids = 0;
-               MyProc->subxids.overflowed = false;
-
-               LWLockRelease(ProcArrayLock);
-       }
+       ProcArrayEndTransaction(MyProc, latestXid);
 
        /*
         * This is all post-commit cleanup.  Note that if an error is raised here,
@@ -1572,13 +1741,17 @@ CommitTransaction(void)
        /* Check we've released all catcache entries */
        AtEOXact_CatCache(true);
 
-       AtEOXact_GUC(true, false);
+       AtEOXact_GUC(true, 1);
        AtEOXact_SPI(true);
+       AtEOXact_xml();
        AtEOXact_on_commit_actions(true);
        AtEOXact_Namespace(true);
        /* smgrcommit already done */
        AtEOXact_Files();
-       pgstat_count_xact_commit();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(true);
+       AtEOXact_PgStat(true);
+       pgstat_report_xact_timestamp(0);
 
        CurrentResourceOwner = NULL;
        ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -1591,7 +1764,10 @@ CommitTransaction(void)
        s->transactionId = InvalidTransactionId;
        s->subTransactionId = InvalidSubTransactionId;
        s->nestingLevel = 0;
-       s->childXids = NIL;
+       s->gucNestLevel = 0;
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
 
        /*
         * done with commit processing, set current transaction state back to
@@ -1667,6 +1843,26 @@ PrepareTransaction(void)
 
        /* NOTIFY and flatfiles will be handled below */
 
+       /*
+        * Don't allow PREPARE TRANSACTION if we've accessed a temporary table
+        * in this transaction.  Having the prepared xact hold locks on another
+        * backend's temp table seems a bad idea --- for instance it would prevent
+        * the backend from exiting.  There are other problems too, such as how
+        * to clean up the source backend's local buffers and ON COMMIT state
+        * if the prepared xact includes a DROP of a temp table.
+        *
+        * We must check this after executing any ON COMMIT actions, because
+        * they might still access a temp relation.
+        *
+        * XXX In principle this could be relaxed to allow some useful special
+        * cases, such as a temp table created and dropped all within the
+        * transaction.  That seems to require much more bookkeeping though.
+        */
+       if (MyXactAccessedTempRel)
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("cannot PREPARE a transaction that has operated on temporary tables")));
+
        /* Prevent cancel/die interrupt while cleaning up */
        HOLD_INTERRUPTS();
 
@@ -1711,6 +1907,7 @@ PrepareTransaction(void)
        AtPrepare_UpdateFlatFiles();
        AtPrepare_Inval();
        AtPrepare_Locks();
+       AtPrepare_PgStat();
 
        /*
         * Here is where we really truly prepare.
@@ -1725,27 +1922,15 @@ PrepareTransaction(void)
         * Now we clean up backend-internal state and release internal resources.
         */
 
-       /* Break the chain of back-links in the XLOG records I output */
-       MyLastRecPtr.xrecoff = 0;
-       MyXactMadeXLogEntry = false;
-       MyXactMadeTempRelUpdate = false;
+       /* Reset XactLastRecEnd until the next transaction writes something */
+       XactLastRecEnd.xrecoff = 0;
 
        /*
         * Let others know about no transaction in progress by me.      This has to be
         * done *after* the prepared transaction has been marked valid, else
         * someone may think it is unlocked and recyclable.
         */
-
-       /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-       MyProc->xid = InvalidTransactionId;
-       MyProc->xmin = InvalidTransactionId;
-
-       /* Clear the subtransaction-XID cache too while holding the lock */
-       MyProc->subxids.nxids = 0;
-       MyProc->subxids.overflowed = false;
-
-       LWLockRelease(ProcArrayLock);
+       ProcArrayClearTransaction(MyProc);
 
        /*
         * This is all post-transaction cleanup.  Note that if an error is raised
@@ -1767,6 +1952,8 @@ PrepareTransaction(void)
 
        /* notify and flatfiles don't need a postprepare call */
 
+       PostPrepare_PgStat();
+
        PostPrepare_Inval();
 
        PostPrepare_smgr();
@@ -1786,12 +1973,16 @@ PrepareTransaction(void)
        AtEOXact_CatCache(true);
 
        /* PREPARE acts the same as COMMIT as far as GUC is concerned */
-       AtEOXact_GUC(true, false);
+       AtEOXact_GUC(true, 1);
        AtEOXact_SPI(true);
+       AtEOXact_xml();
        AtEOXact_on_commit_actions(true);
        AtEOXact_Namespace(true);
        /* smgrcommit already done */
        AtEOXact_Files();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(true);
+       /* don't call AtEOXact_PgStat here */
 
        CurrentResourceOwner = NULL;
        ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -1804,7 +1995,10 @@ PrepareTransaction(void)
        s->transactionId = InvalidTransactionId;
        s->subTransactionId = InvalidSubTransactionId;
        s->nestingLevel = 0;
-       s->childXids = NIL;
+       s->gucNestLevel = 0;
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
 
        /*
         * done with 1st phase commit processing, set current transaction state
@@ -1823,10 +2017,15 @@ static void
 AbortTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
+       TransactionId latestXid;
 
        /* Prevent cancel/die interrupt while cleaning up */
        HOLD_INTERRUPTS();
 
+       /* Make sure we have a valid memory context and resource owner */
+       AtAbort_Memory();
+       AtAbort_ResourceOwner();
+
        /*
         * Release any LW locks we might be holding as quickly as possible.
         * (Regular locks, however, must be held till we finish aborting.)
@@ -1859,22 +2058,17 @@ AbortTransaction(void)
         */
        s->state = TRANS_ABORT;
 
-       /* Make sure we have a valid memory context and resource owner */
-       AtAbort_Memory();
-       AtAbort_ResourceOwner();
-
        /*
-        * Reset user id which might have been changed transiently.  We cannot use
-        * s->currentUser, since it may not be set yet; instead rely on internal
-        * state of miscinit.c.
+        * Reset user ID which might have been changed transiently.  We need this
+        * to clean up in case control escaped out of a SECURITY DEFINER function
+        * or other local change of CurrentUserId; therefore, the prior value
+        * of SecurityDefinerContext also needs to be restored.
         *
-        * (Note: it is not necessary to restore session authorization here
-        * because that can only be changed via GUC, and GUC will take care of
-        * rolling it back if need be.  However, an error within a SECURITY
-        * DEFINER function could send control here with the wrong current
-        * userid.)
+        * (Note: it is not necessary to restore session authorization or role
+        * settings here because those can only be changed via GUC, and GUC will
+        * take care of rolling them back if need be.)
         */
-       AtAbort_UserId();
+       SetUserIdAndContext(s->prevUser, s->prevSecDefCxt);
 
        /*
         * do abort processing
@@ -1889,27 +2083,16 @@ AbortTransaction(void)
         * Advertise the fact that we aborted in pg_clog (assuming that we got as
         * far as assigning an XID to advertise).
         */
-       if (TransactionIdIsValid(s->transactionId))
-               RecordTransactionAbort();
+       latestXid = RecordTransactionAbort(false);
+
+       TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid);
 
        /*
         * Let others know about no transaction in progress by me. Note that this
         * must be done _before_ releasing locks we hold and _after_
         * RecordTransactionAbort.
         */
-       if (MyProc != NULL)
-       {
-               /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
-               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-               MyProc->xid = InvalidTransactionId;
-               MyProc->xmin = InvalidTransactionId;
-
-               /* Clear the subtransaction-XID cache too while holding the lock */
-               MyProc->subxids.nxids = 0;
-               MyProc->subxids.overflowed = false;
-
-               LWLockRelease(ProcArrayLock);
-       }
+       ProcArrayEndTransaction(MyProc, latestXid);
 
        /*
         * Post-abort cleanup.  See notes in CommitTransaction() concerning
@@ -1934,13 +2117,17 @@ AbortTransaction(void)
                                                 false, true);
        AtEOXact_CatCache(false);
 
-       AtEOXact_GUC(false, false);
+       AtEOXact_GUC(false, 1);
        AtEOXact_SPI(false);
+       AtEOXact_xml();
        AtEOXact_on_commit_actions(false);
        AtEOXact_Namespace(false);
        smgrabort();
        AtEOXact_Files();
-       pgstat_count_xact_rollback();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(false);
+       AtEOXact_PgStat(false);
+       pgstat_report_xact_timestamp(0);
 
        /*
         * State remains TRANS_ABORT until CleanupTransaction().
@@ -1980,7 +2167,10 @@ CleanupTransaction(void)
        s->transactionId = InvalidTransactionId;
        s->subTransactionId = InvalidSubTransactionId;
        s->nestingLevel = 0;
-       s->childXids = NIL;
+       s->gucNestLevel = 0;
+       s->childXids = NULL;
+       s->nChildXids = 0;
+       s->maxChildXids = 0;
 
        /*
         * done with abort processing, set current transaction state back to
@@ -2439,12 +2629,14 @@ AbortCurrentTransaction(void)
  *     could issue more commands and possibly cause a failure after the statement
  *     completes).  Subtransactions are verboten too.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
- *     stmtType: statement type name for error messages.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function or multi-query querystring.  (We will always fail if
+ *     this is false, but it's convenient to centralize the check here instead of
+ *     making callers do it.)
+ *     stmtType: statement type name, for error messages.
  */
 void
-PreventTransactionChain(void *stmtNode, const char *stmtType)
+PreventTransactionChain(bool isTopLevel, const char *stmtType)
 {
        /*
         * xact block already started?
@@ -2467,15 +2659,14 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
                                                stmtType)));
 
        /*
-        * Are we inside a function call?  If the statement's parameter block was
-        * allocated in QueryContext, assume it is an interactive command.
-        * Otherwise assume it is coming from a function.
+        * inside a function call or multi-command query string?
         */
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                ereport(ERROR,
                                (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
                /* translator: %s represents an SQL statement name */
-                                errmsg("%s cannot be executed from a function", stmtType)));
+                                errmsg("%s cannot be executed from a function or multi-command string",
+                                               stmtType)));
 
        /* If we got past IsTransactionBlock test, should be in default state */
        if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
@@ -2497,12 +2688,12 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
  *     use of the current statement's results.  Likewise subtransactions.
  *     Thus this is an inverse for PreventTransactionChain.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
- *     stmtType: statement type name for error messages.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function or multi-query querystring.
+ *     stmtType: statement type name, for error messages.
  */
 void
-RequireTransactionChain(void *stmtNode, const char *stmtType)
+RequireTransactionChain(bool isTopLevel, const char *stmtType)
 {
        /*
         * xact block already started?
@@ -2517,16 +2708,15 @@ RequireTransactionChain(void *stmtNode, const char *stmtType)
                return;
 
        /*
-        * Are we inside a function call?  If the statement's parameter block was
-        * allocated in QueryContext, assume it is an interactive command.
-        * Otherwise assume it is coming from a function.
+        * inside a function call or multi-command query string?
         */
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                return;
+
        ereport(ERROR,
                        (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
        /* translator: %s represents an SQL statement name */
-                        errmsg("%s may only be used in transaction blocks",
+                        errmsg("%s can only be used in transaction blocks",
                                        stmtType)));
 }
 
@@ -2537,11 +2727,11 @@ RequireTransactionChain(void *stmtNode, const char *stmtType)
  *     a transaction block than when running as single commands.  ANALYZE is
  *     currently the only example.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function or multi-query querystring.
  */
 bool
-IsInTransactionChain(void *stmtNode)
+IsInTransactionChain(bool isTopLevel)
 {
        /*
         * Return true on same conditions that would make PreventTransactionChain
@@ -2553,7 +2743,7 @@ IsInTransactionChain(void *stmtNode)
        if (IsSubTransaction())
                return true;
 
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                return true;
 
        if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
@@ -2965,7 +3155,7 @@ UserAbortTransactionBlock(void)
                         * default state.
                         */
                case TBLOCK_STARTED:
-                       ereport(WARNING,
+                       ereport(NOTICE,
                                        (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
                                         errmsg("there is no transaction in progress")));
                        s->blockState = TBLOCK_ABORT_PENDING;
@@ -3250,10 +3440,11 @@ RollbackToSavepoint(List *options)
 
 /*
  * BeginInternalSubTransaction
- *             This is the same as DefineSavepoint except it allows TBLOCK_STARTED
- *             state, and therefore it can safely be used in a function that might
- *             be called when not inside a BEGIN block.  Also, we automatically
- *             cycle through CommitTransactionCommand/StartTransactionCommand
+ *             This is the same as DefineSavepoint except it allows TBLOCK_STARTED,
+ *             TBLOCK_END, and TBLOCK_PREPARE states, and therefore it can safely be
+ *             used in functions that might be called when not inside a BEGIN block
+ *             or when running deferred triggers at COMMIT/PREPARE time.  Also, it
+ *             automatically does CommitTransactionCommand/StartTransactionCommand
  *             instead of expecting the caller to do it.
  */
 void
@@ -3265,6 +3456,8 @@ BeginInternalSubTransaction(char *name)
        {
                case TBLOCK_STARTED:
                case TBLOCK_INPROGRESS:
+               case TBLOCK_END:
+               case TBLOCK_PREPARE:
                case TBLOCK_SUBINPROGRESS:
                        /* Normal subtransaction start */
                        PushTransaction();
@@ -3282,7 +3475,6 @@ BeginInternalSubTransaction(char *name)
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
                case TBLOCK_SUBBEGIN:
-               case TBLOCK_END:
                case TBLOCK_SUBEND:
                case TBLOCK_ABORT:
                case TBLOCK_SUBABORT:
@@ -3292,7 +3484,6 @@ BeginInternalSubTransaction(char *name)
                case TBLOCK_SUBABORT_PENDING:
                case TBLOCK_SUBRESTART:
                case TBLOCK_SUBABORT_RESTART:
-               case TBLOCK_PREPARE:
                        elog(FATAL, "BeginInternalSubTransaction: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
@@ -3600,13 +3791,11 @@ CommitSubTransaction(void)
        CommandCounterIncrement();
 
        /* Mark subtransaction as subcommitted */
-       if (TransactionIdIsValid(s->transactionId))
-       {
-               RecordSubTransactionCommit();
-               AtSubCommit_childXids();
-       }
+       RecordSubTransactionCommit();
 
        /* Post-commit cleanup */
+       if (TransactionIdIsValid(s->transactionId))
+               AtSubCommit_childXids();
        AfterTriggerEndSubXact(true);
        AtSubCommit_Portals(s->subTransactionId,
                                                s->parent->subTransactionId,
@@ -3643,7 +3832,7 @@ CommitSubTransaction(void)
                                                 RESOURCE_RELEASE_AFTER_LOCKS,
                                                 true, false);
 
-       AtEOXact_GUC(true, true);
+       AtEOXact_GUC(true, s->gucNestLevel);
        AtEOSubXact_SPI(true, s->subTransactionId);
        AtEOSubXact_on_commit_actions(true, s->subTransactionId,
                                                                  s->parent->subTransactionId);
@@ -3651,6 +3840,8 @@ CommitSubTransaction(void)
                                                  s->parent->subTransactionId);
        AtEOSubXact_Files(true, s->subTransactionId,
                                          s->parent->subTransactionId);
+       AtEOSubXact_HashTables(true, s->nestingLevel);
+       AtEOSubXact_PgStat(true, s->nestingLevel);
 
        /*
         * We need to restore the upper transaction's read-only state, in case the
@@ -3679,15 +3870,12 @@ AbortSubTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       ShowTransactionState("AbortSubTransaction");
-
-       if (s->state != TRANS_INPROGRESS)
-               elog(WARNING, "AbortSubTransaction while in %s state",
-                        TransStateAsString(s->state));
-
+       /* Prevent cancel/die interrupt while cleaning up */
        HOLD_INTERRUPTS();
 
-       s->state = TRANS_ABORT;
+       /* Make sure we have a valid memory context and resource owner */
+       AtSubAbort_Memory();
+       AtSubAbort_ResourceOwner();
 
        /*
         * Release any LW locks we might be holding as quickly as possible.
@@ -3706,10 +3894,21 @@ AbortSubTransaction(void)
        LockWaitCancel();
 
        /*
-        * do abort processing
+        * check the current transaction state
         */
-       AtSubAbort_Memory();
-       AtSubAbort_ResourceOwner();
+       ShowTransactionState("AbortSubTransaction");
+
+       if (s->state != TRANS_INPROGRESS)
+               elog(WARNING, "AbortSubTransaction while in %s state",
+                        TransStateAsString(s->state));
+
+       s->state = TRANS_ABORT;
+
+       /*
+        * Reset user ID which might have been changed transiently.  (See notes
+        * in AbortTransaction.)
+        */
+       SetUserIdAndContext(s->prevUser, s->prevSecDefCxt);
 
        /*
         * We can skip all this stuff if the subxact failed before creating a
@@ -3728,13 +3927,12 @@ AbortSubTransaction(void)
                                                                        s->parent->subTransactionId);
 
                /* Advertise the fact that we aborted in pg_clog. */
+               (void) RecordTransactionAbort(true);
+
+               /* Post-abort cleanup */
                if (TransactionIdIsValid(s->transactionId))
-               {
-                       RecordSubTransactionAbort();
                        AtSubAbort_childXids();
-               }
 
-               /* Post-abort cleanup */
                CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId,
                                                         s->parent->subTransactionId);
 
@@ -3752,30 +3950,19 @@ AbortSubTransaction(void)
                                                         RESOURCE_RELEASE_AFTER_LOCKS,
                                                         false, false);
 
-               AtEOXact_GUC(false, true);
+               AtEOXact_GUC(false, s->gucNestLevel);
                AtEOSubXact_SPI(false, s->subTransactionId);
+               AtEOXact_xml();
                AtEOSubXact_on_commit_actions(false, s->subTransactionId,
                                                                          s->parent->subTransactionId);
                AtEOSubXact_Namespace(false, s->subTransactionId,
                                                          s->parent->subTransactionId);
                AtEOSubXact_Files(false, s->subTransactionId,
                                                  s->parent->subTransactionId);
+               AtEOSubXact_HashTables(false, s->nestingLevel);
+               AtEOSubXact_PgStat(false, s->nestingLevel);
        }
 
-       /*
-        * Reset user id which might have been changed transiently.  Here we want
-        * to restore to the userid that was current at subxact entry. (As in
-        * AbortTransaction, we need not worry about the session userid.)
-        *
-        * Must do this after AtEOXact_GUC to handle the case where we entered the
-        * subxact inside a SECURITY DEFINER function (hence current and session
-        * userids were different) and then session auth was changed inside the
-        * subxact.  GUC will reset both current and session userids to the
-        * entry-time session userid.  This is right in every other scenario so it
-        * seems simplest to let GUC do that and fix it here.
-        */
-       SetUserId(s->currentUser);
-
        /*
         * Restore the upper transaction's read-only state, too.  This should be
         * redundant with GUC's cleanup but we may as well do it for consistency
@@ -3830,13 +4017,6 @@ PushTransaction(void)
 {
        TransactionState p = CurrentTransactionState;
        TransactionState s;
-       Oid                     currentUser;
-
-       /*
-        * At present, GetUserId cannot fail, but let's not assume that.  Get the
-        * ID before entering the critical code sequence.
-        */
-       currentUser = GetUserId();
 
        /*
         * We keep subtransaction state nodes in TopTransactionContext.
@@ -3866,10 +4046,11 @@ PushTransaction(void)
        s->subTransactionId = currentSubTransactionId;
        s->parent = p;
        s->nestingLevel = p->nestingLevel + 1;
+       s->gucNestLevel = NewGUCNestLevel();
        s->savepointLevel = p->savepointLevel;
        s->state = TRANS_DEFAULT;
        s->blockState = TBLOCK_SUBBEGIN;
-       s->currentUser = currentUser;
+       GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
        s->prevXactReadOnly = XactReadOnly;
 
        CurrentTransactionState = s;
@@ -3939,20 +4120,35 @@ ShowTransactionState(const char *str)
 static void
 ShowTransactionStateRec(TransactionState s)
 {
+       StringInfoData buf;
+
+       initStringInfo(&buf);
+
+       if (s->nChildXids > 0)
+       {
+               int i;
+
+               appendStringInfo(&buf, "%u", s->childXids[0]);
+               for (i = 1; i < s->nChildXids; i++)
+                       appendStringInfo(&buf, " %u", s->childXids[i]);
+       }
+
        if (s->parent)
                ShowTransactionStateRec(s->parent);
 
        /* use ereport to suppress computation if msg will not be printed */
        ereport(DEBUG3,
-                       (errmsg_internal("name: %s; blockState: %13s; state: %7s, xid/subid/cid: %u/%u/%u, nestlvl: %d, children: %s",
+                       (errmsg_internal("name: %s; blockState: %13s; state: %7s, xid/subid/cid: %u/%u/%u%s, nestlvl: %d, children: %s",
                                                         PointerIsValid(s->name) ? s->name : "unnamed",
                                                         BlockStateAsString(s->blockState),
                                                         TransStateAsString(s->state),
                                                         (unsigned int) s->transactionId,
                                                         (unsigned int) s->subTransactionId,
                                                         (unsigned int) currentCommandId,
-                                                        s->nestingLevel,
-                                                        nodeToString(s->childXids))));
+                                                        currentCommandIdUsed ? " (used)" : "",
+                                                        s->nestingLevel, buf.data)));
+
+       pfree(buf.data);
 }
 
 /*
@@ -4031,36 +4227,22 @@ TransStateAsString(TransState state)
  * xactGetCommittedChildren
  *
  * Gets the list of committed children of the current transaction.     The return
- * value is the number of child transactions.  *children is set to point to a
- * palloc'd array of TransactionIds.  If there are no subxacts, *children is
- * set to NULL.
+ * value is the number of child transactions.  *ptr is set to point to an
+ * array of TransactionIds.  The array is allocated in TopTransactionContext;
+ * the caller should *not* pfree() it (this is a change from pre-8.4 code!).
+ * If there are no subxacts, *ptr is set to NULL.
  */
 int
 xactGetCommittedChildren(TransactionId **ptr)
 {
        TransactionState s = CurrentTransactionState;
-       int                     nchildren;
-       TransactionId *children;
-       ListCell   *p;
 
-       nchildren = list_length(s->childXids);
-       if (nchildren == 0)
-       {
+       if (s->nChildXids == 0)
                *ptr = NULL;
-               return 0;
-       }
-
-       children = (TransactionId *) palloc(nchildren * sizeof(TransactionId));
-       *ptr = children;
-
-       foreach(p, s->childXids)
-       {
-               TransactionId child = lfirst_xid(p);
-
-               *children++ = child;
-       }
+       else
+               *ptr = s->childXids;
 
-       return nchildren;
+       return s->nChildXids;
 }
 
 /*
@@ -4181,12 +4363,9 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
 static void
 xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
 {
-       struct tm  *tm = localtime(&xlrec->xtime);
        int                     i;
 
-       appendStringInfo(buf, "%04u-%02u-%02u %02u:%02u:%02u",
-                       tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
-                       tm->tm_hour, tm->tm_min, tm->tm_sec);
+       appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
        if (xlrec->nrels > 0)
        {
                appendStringInfo(buf, "; rels:");
@@ -4195,7 +4374,7 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
                        RelFileNode rnode = xlrec->xnodes[i];
 
                        appendStringInfo(buf, " %u/%u/%u",
-                                       rnode.spcNode, rnode.dbNode, rnode.relNode);
+                                                        rnode.spcNode, rnode.dbNode, rnode.relNode);
                }
        }
        if (xlrec->nsubxacts > 0)
@@ -4212,12 +4391,9 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
 static void
 xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
 {
-       struct tm  *tm = localtime(&xlrec->xtime);
        int                     i;
 
-       appendStringInfo(buf, "%04u-%02u-%02u %02u:%02u:%02u",
-                       tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
-                       tm->tm_hour, tm->tm_min, tm->tm_sec);
+       appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
        if (xlrec->nrels > 0)
        {
                appendStringInfo(buf, "; rels:");
@@ -4226,7 +4402,7 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
                        RelFileNode rnode = xlrec->xnodes[i];
 
                        appendStringInfo(buf, " %u/%u/%u",
-                                       rnode.spcNode, rnode.dbNode, rnode.relNode);
+                                                        rnode.spcNode, rnode.dbNode, rnode.relNode);
                }
        }
        if (xlrec->nsubxacts > 0)
@@ -4243,7 +4419,7 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
 void
 xact_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
-       uint8                   info = xl_info & ~XLR_INFO_MASK;
+       uint8           info = xl_info & ~XLR_INFO_MASK;
 
        if (info == XLOG_XACT_COMMIT)
        {