]> granicus.if.org Git - postgresql/commitdiff
Improve concurrency of foreign key locking
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Wed, 23 Jan 2013 15:04:59 +0000 (12:04 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Wed, 23 Jan 2013 15:04:59 +0000 (12:04 -0300)
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE".  These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE".  UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.

Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.

The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid.  Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates.  This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed.  pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.

Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header.  This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.

Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This also causes additional WAL-logging (there was previously a single
WAL record for a locked tuple; now there is one for each updated copy
of the tuple that exists).

With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.

As a bonus, this fixes the old behavior whereby a subtransaction that
grabbed a stronger tuple lock than the one its parent (sub)transaction
held on a given tuple, and later aborted, caused the weaker lock to be lost.

Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane.  There's probably room for several more tests.

There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time on it.  The original idea for
the patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most of the code is mine, with contributions from Marti Raudsepp,
Alexander Shulgin, Noah Misch and Andres Freund.

This patch was discussed in several pgsql-hackers threads; the most
important ones start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov

106 files changed:
contrib/file_fdw/output/file_fdw.source
contrib/pageinspect/heapfuncs.c
contrib/pg_upgrade/controldata.c
contrib/pg_upgrade/pg_upgrade.c
contrib/pg_upgrade/pg_upgrade.h
contrib/pgrowlocks/Makefile
contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql [new file with mode: 0644]
contrib/pgrowlocks/pgrowlocks--1.1.sql [moved from contrib/pgrowlocks/pgrowlocks--1.0.sql with 83% similarity]
contrib/pgrowlocks/pgrowlocks.c
contrib/pgrowlocks/pgrowlocks.control
doc/src/sgml/pgrowlocks.sgml
doc/src/sgml/ref/select.sgml
src/backend/access/common/heaptuple.c
src/backend/access/heap/README.tuplock [new file with mode: 0644]
src/backend/access/heap/heapam.c
src/backend/access/heap/pruneheap.c
src/backend/access/heap/rewriteheap.c
src/backend/access/rmgrdesc/heapdesc.c
src/backend/access/rmgrdesc/mxactdesc.c
src/backend/access/rmgrdesc/xlogdesc.c
src/backend/access/transam/README
src/backend/access/transam/multixact.c
src/backend/access/transam/varsup.c
src/backend/access/transam/xlog.c
src/backend/catalog/heap.c
src/backend/catalog/index.c
src/backend/commands/analyze.c
src/backend/commands/cluster.c
src/backend/commands/dbcommands.c
src/backend/commands/sequence.c
src/backend/commands/tablecmds.c
src/backend/commands/trigger.c
src/backend/commands/vacuum.c
src/backend/commands/vacuumlazy.c
src/backend/executor/execMain.c
src/backend/executor/nodeLockRows.c
src/backend/executor/nodeModifyTable.c
src/backend/nodes/copyfuncs.c
src/backend/nodes/equalfuncs.c
src/backend/nodes/outfuncs.c
src/backend/nodes/readfuncs.c
src/backend/optimizer/plan/initsplan.c
src/backend/optimizer/plan/planner.c
src/backend/parser/analyze.c
src/backend/parser/gram.y
src/backend/postmaster/autovacuum.c
src/backend/rewrite/rewriteHandler.c
src/backend/storage/lmgr/lock.c
src/backend/storage/lmgr/predicate.c
src/backend/tcop/utility.c
src/backend/utils/adt/ri_triggers.c
src/backend/utils/adt/ruleutils.c
src/backend/utils/cache/relcache.c
src/backend/utils/time/combocid.c
src/backend/utils/time/tqual.c
src/bin/pg_controldata/pg_controldata.c
src/bin/pg_resetxlog/pg_resetxlog.c
src/include/access/heapam.h
src/include/access/heapam_xlog.h
src/include/access/htup.h
src/include/access/htup_details.h
src/include/access/multixact.h
src/include/access/rewriteheap.h
src/include/catalog/catversion.h
src/include/catalog/pg_class.h
src/include/catalog/pg_control.h
src/include/catalog/pg_database.h
src/include/catalog/pg_proc.h
src/include/commands/cluster.h
src/include/commands/vacuum.h
src/include/executor/executor.h
src/include/nodes/execnodes.h
src/include/nodes/parsenodes.h
src/include/nodes/plannodes.h
src/include/parser/analyze.h
src/include/postgres.h
src/include/storage/lock.h
src/include/utils/builtins.h
src/include/utils/rel.h
src/include/utils/relcache.h
src/include/utils/tqual.h
src/test/isolation/expected/aborted-keyrevoke.out [new file with mode: 0644]
src/test/isolation/expected/aborted-keyrevoke_2.out [new file with mode: 0644]
src/test/isolation/expected/delete-abort-savept-2.out [new file with mode: 0644]
src/test/isolation/expected/delete-abort-savept.out [new file with mode: 0644]
src/test/isolation/expected/fk-contention.out
src/test/isolation/expected/fk-deadlock.out
src/test/isolation/expected/fk-deadlock2.out
src/test/isolation/expected/fk-deadlock2_1.out
src/test/isolation/expected/fk-deadlock2_2.out [new file with mode: 0644]
src/test/isolation/expected/fk-deadlock_1.out
src/test/isolation/expected/fk-deadlock_2.out [new file with mode: 0644]
src/test/isolation/expected/fk-delete-insert.out [new file with mode: 0644]
src/test/isolation/expected/lock-update-delete.out [new file with mode: 0644]
src/test/isolation/expected/lock-update-traversal.out [new file with mode: 0644]
src/test/isolation/expected/multixact-no-deadlock.out [new file with mode: 0644]
src/test/isolation/isolation_schedule
src/test/isolation/isolationtester.c
src/test/isolation/specs/aborted-keyrevoke.spec [new file with mode: 0644]
src/test/isolation/specs/delete-abort-savept-2.spec [new file with mode: 0644]
src/test/isolation/specs/delete-abort-savept.spec [new file with mode: 0644]
src/test/isolation/specs/fk-deadlock.spec
src/test/isolation/specs/fk-deadlock2.spec
src/test/isolation/specs/lock-update-delete.spec [new file with mode: 0644]
src/test/isolation/specs/lock-update-traversal.spec [new file with mode: 0644]
src/test/isolation/specs/multixact-no-deadlock.spec [new file with mode: 0644]

index 6f906e1fc8c8c6f8adb3ea7a089c18cd3647c1bd..c01f8d804bc50911dc3e09748774ade8dc0576db 100644 (file)
@@ -191,7 +191,7 @@ ERROR:  cannot change foreign table "agg_csv"
 DELETE FROM agg_csv WHERE a = 100;
 ERROR:  cannot change foreign table "agg_csv"
 SELECT * FROM agg_csv FOR UPDATE OF agg_csv;
-ERROR:  SELECT FOR UPDATE/SHARE cannot be used with foreign table "agg_csv"
+ERROR:  SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be used with foreign table "agg_csv"
 LINE 1: SELECT * FROM agg_csv FOR UPDATE OF agg_csv;
                                             ^
 -- but this should be ignored
index bbf796ff43514efc1fea5d5f5a246b202ae948c9..6d8f6f1c74064b0b9b47e17d295833e1a5f1b11b 100644 (file)
@@ -163,7 +163,7 @@ heap_page_items(PG_FUNCTION_ARGS)
                        tuphdr = (HeapTupleHeader) PageGetItem(page, id);
 
                        values[4] = UInt32GetDatum(HeapTupleHeaderGetXmin(tuphdr));
-                       values[5] = UInt32GetDatum(HeapTupleHeaderGetXmax(tuphdr));
+                       values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr));
                        values[6] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); /* shared with xvac */
                        values[7] = PointerGetDatum(&tuphdr->t_ctid);
                        values[8] = UInt32GetDatum(tuphdr->t_infomask2);
index 9218f65abc36bfc176af5f5b0da120d1ee912009..7c80c873153fbac139b37ff3ed04a3b0db6910ec 100644 (file)
@@ -40,6 +40,9 @@ get_control_data(ClusterInfo *cluster, bool live_check)
        bool            got_xid = false;
        bool            got_oid = false;
        bool            got_nextxlogfile = false;
+       bool            got_multi = false;
+       bool            got_mxoff = false;
+       bool            got_oldestmulti = false;
        bool            got_log_id = false;
        bool            got_log_seg = false;
        bool            got_tli = false;
@@ -246,6 +249,39 @@ get_control_data(ClusterInfo *cluster, bool live_check)
                        cluster->controldata.chkpnt_nxtoid = str2uint(p);
                        got_oid = true;
                }
+               else if ((p = strstr(bufin, "Latest checkpoint's NextMultiXactId:")) != NULL)
+               {
+                       p = strchr(p, ':');
+
+                       if (p == NULL || strlen(p) <= 1)
+                               pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+
+                       p++;                            /* removing ':' char */
+                       cluster->controldata.chkpnt_nxtmulti = str2uint(p);
+                       got_multi = true;
+               }
+               else if ((p = strstr(bufin, "Latest checkpoint's oldestMultiXid:")) != NULL)
+               {
+                       p = strchr(p, ':');
+
+                       if (p == NULL || strlen(p) <= 1)
+                               pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+
+                       p++;                            /* removing ':' char */
+                       cluster->controldata.chkpnt_oldstMulti = str2uint(p);
+                       got_oldestmulti = true;
+               }
+               else if ((p = strstr(bufin, "Latest checkpoint's NextMultiOffset:")) != NULL)
+               {
+                       p = strchr(p, ':');
+
+                       if (p == NULL || strlen(p) <= 1)
+                               pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+
+                       p++;                            /* removing ':' char */
+                       cluster->controldata.chkpnt_nxtmxoff = str2uint(p);
+                       got_mxoff = true;
+               }
                else if ((p = strstr(bufin, "Maximum data alignment:")) != NULL)
                {
                        p = strchr(p, ':');
@@ -433,6 +469,7 @@ get_control_data(ClusterInfo *cluster, bool live_check)
 
        /* verify that we got all the mandatory pg_control data */
        if (!got_xid || !got_oid ||
+               !got_multi || !got_mxoff || !got_oldestmulti ||
                (!live_check && !got_nextxlogfile) ||
                !got_tli ||
                !got_align || !got_blocksz || !got_largesz || !got_walsz ||
@@ -448,6 +485,15 @@ get_control_data(ClusterInfo *cluster, bool live_check)
                if (!got_oid)
                        pg_log(PG_REPORT, "  latest checkpoint next OID\n");
 
+               if (!got_multi)
+                       pg_log(PG_REPORT, "  latest checkpoint next MultiXactId\n");
+
+               if (!got_mxoff)
+                       pg_log(PG_REPORT, "  latest checkpoint next MultiXactOffset\n");
+
+               if (!got_oldestmulti)
+                       pg_log(PG_REPORT, "  latest checkpoint oldest MultiXactId\n");
+
                if (!live_check && !got_nextxlogfile)
                        pg_log(PG_REPORT, "  first WAL segment after reset\n");
 
index 88494b8d6deeb8ce65f71d2301eb8b8f45efa796..a752fe8eda1a260c8332f6765706ad416e920a70 100644 (file)
@@ -382,6 +382,52 @@ copy_clog_xlog_xid(void)
                          new_cluster.pgdata);
        check_ok();
 
+       /*
+        * If both new and old are after the pg_multixact change commit, copy those
+        * files too.  If the old server is before that change and the new server
+        * is after, then we don't copy anything but we need to reset pg_control so
+        * that the new server doesn't attempt to read multis older than the cutoff
+        * value.
+        */
+       if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+               new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+       {
+               copy_subdir_files("pg_multixact/offsets");
+               copy_subdir_files("pg_multixact/members");
+               prep_status("Setting next multixact ID and offset for new cluster");
+               /*
+                * we preserve all files and contents, so we must preserve both "next"
+                * counters here and the oldest multi present on system.
+                */
+               exec_prog(UTILITY_LOG_FILE, NULL, true,
+                                 "\"%s/pg_resetxlog\" -O %u -m %u,%u \"%s\"",
+                                 new_cluster.bindir,
+                                 old_cluster.controldata.chkpnt_nxtmxoff,
+                                 old_cluster.controldata.chkpnt_nxtmulti,
+                                 old_cluster.controldata.chkpnt_oldstMulti,
+                                 new_cluster.pgdata);
+               check_ok();
+       }
+       else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+       {
+               prep_status("Setting oldest multixact ID on new cluster");
+               /*
+                * We don't preserve files in this case, but it's important that the
+                * oldest multi is set to the latest value used by the old system, so
+                * that multixact.c returns the empty set for multis that might be
+                * present on disk.  We set next multi to the value following that; it
+                * might end up wrapped around (i.e. 0) if the old cluster had
+                * next=MaxMultiXactId, but multixact.c can cope with that just fine.
+                */
+               exec_prog(UTILITY_LOG_FILE, NULL, true,
+                                 "\"%s/pg_resetxlog\" -m %u,%u \"%s\"",
+                                 new_cluster.bindir,
+                                 old_cluster.controldata.chkpnt_nxtmulti + 1,
+                                 old_cluster.controldata.chkpnt_nxtmulti,
+                                 new_cluster.pgdata);
+               check_ok();
+       }
+
        /* now reset the wal archives in the new cluster */
        prep_status("Resetting WAL archives");
        exec_prog(UTILITY_LOG_FILE, NULL, true,
index d5c3fa9e830bd5545d09c6388ee51e40a9b5437b..70b93816679800bdf3b1c22cae9d12281e0c6d66 100644 (file)
@@ -108,6 +108,10 @@ extern char *output_files[];
  */
 #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031
 
+/*
+ * pg_multixact format changed in this catversion:
+ */
+#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
 
 /*
  * Each relation is represented by a relinfo structure.
@@ -182,6 +186,9 @@ typedef struct
        uint32          chkpnt_tli;
        uint32          chkpnt_nxtxid;
        uint32          chkpnt_nxtoid;
+       uint32          chkpnt_nxtmulti;
+       uint32          chkpnt_nxtmxoff;
+       uint32          chkpnt_oldstMulti;
        uint32          align;
        uint32          blocksz;
        uint32          largesz;
index f56389b0e21c3e5612c361c8ffdf5a8110c14eea..fe8042344f675dd4abc1fee834c2064ab7cfb577 100644 (file)
@@ -4,7 +4,7 @@ MODULE_big      = pgrowlocks
 OBJS           = pgrowlocks.o
 
 EXTENSION = pgrowlocks
-DATA = pgrowlocks--1.0.sql pgrowlocks--unpackaged--1.0.sql
+DATA = pgrowlocks--1.1.sql pgrowlocks--1.0--1.1.sql pgrowlocks--unpackaged--1.0.sql
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql
new file mode 100644 (file)
index 0000000..d98cd80
--- /dev/null
@@ -0,0 +1,17 @@
+/* contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pgrowlocks UPDATE TO '1.1'" to load this file. \quit
+
+ALTER EXTENSION pgrowlocks DROP FUNCTION pgrowlocks(text);  -- detach the 1.0 function from the extension
+DROP FUNCTION pgrowlocks(text);  -- result columns change (lock_type out, modes in), so drop and recreate
+CREATE FUNCTION pgrowlocks(IN relname text,
+    OUT locked_row TID,                -- TID of the locked row
+    OUT locker XID,            -- raw xmax: locking XID or MultiXactId
+    OUT multi bool,            -- true if locker is a MultiXactId
+    OUT xids xid[],            -- XIDs of the lockers
+    OUT modes text[],          -- lock mode of each locker
+    OUT pids INTEGER[])                -- backend PID of each locker
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pgrowlocks'
+LANGUAGE C STRICT;
similarity index 83%
rename from contrib/pgrowlocks/pgrowlocks--1.0.sql
rename to contrib/pgrowlocks/pgrowlocks--1.1.sql
index a909b7430d8fffa80006c503f892b08a24640c28..29079f49231739fbd1970f900492021ed4df6b3c 100644 (file)
@@ -1,14 +1,14 @@
-/* contrib/pgrowlocks/pgrowlocks--1.0.sql */
+/* contrib/pgrowlocks/pgrowlocks--1.1.sql */
 
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit
 
 CREATE FUNCTION pgrowlocks(IN relname text,
     OUT locked_row TID,                -- row TID
-    OUT lock_type TEXT,                -- lock type
     OUT locker XID,            -- locking XID
     OUT multi bool,            -- multi XID?
     OUT xids xid[],            -- multi XIDs
+    OUT modes text[],          -- multi XID statuses
     OUT pids INTEGER[])                -- locker's process id
 RETURNS SETOF record
 AS 'MODULE_PATHNAME', 'pgrowlocks'
index 20beed2a300b8e88ce31ef36ad1ef589bf861d3d..43ada57352d2f3bb6055e42b200f0c9b228262fa 100644 (file)
@@ -59,6 +59,13 @@ typedef struct
        int                     ncolumns;
 } MyData;
 
+#define                Atnum_tid               0
+#define                Atnum_xmax              1
+#define                Atnum_ismulti   2
+#define                Atnum_xids              3
+#define                Atnum_modes             4
+#define                Atnum_pids              5
+
 Datum
 pgrowlocks(PG_FUNCTION_ARGS)
 {
@@ -117,79 +124,146 @@ pgrowlocks(PG_FUNCTION_ARGS)
        /* scan the relation */
        while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        {
+               HTSU_Result     htsu;
+               TransactionId xmax;
+               uint16          infomask;
+
                /* must hold a buffer lock to call HeapTupleSatisfiesUpdate */
                LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 
-               if (HeapTupleSatisfiesUpdate(tuple->t_data,
-                                                                        GetCurrentCommandId(false),
-                                                                        scan->rs_cbuf) == HeapTupleBeingUpdated)
+               htsu = HeapTupleSatisfiesUpdate(tuple->t_data,
+                                                                               GetCurrentCommandId(false),
+                                                                               scan->rs_cbuf);
+               xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
+               infomask = tuple->t_data->t_infomask;
+
+               /*
+                * a tuple is locked if HTSU returns BeingUpdated, or if it returns
+                * MayBeUpdated but the Xmax is a valid non-multi XID pointing at us.
+                */
+               if (htsu == HeapTupleBeingUpdated ||
+                       (htsu == HeapTupleMayBeUpdated &&
+                        !(infomask & HEAP_XMAX_INVALID) &&
+                        !(infomask & HEAP_XMAX_IS_MULTI) &&
+                        (xmax == GetCurrentTransactionIdIfAny())))
                {
-
                        char      **values;
-                       int                     i;
 
                        values = (char **) palloc(mydata->ncolumns * sizeof(char *));
 
-                       i = 0;
-                       values[i++] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self));
+                       values[Atnum_tid] = (char *) DirectFunctionCall1(tidout,
+                                                                                                                        PointerGetDatum(&tuple->t_self));
 
-                       if (tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK)
-                               values[i++] = pstrdup("Shared");
-                       else
-                               values[i++] = pstrdup("Exclusive");
-                       values[i] = palloc(NCHARS * sizeof(char));
-                       snprintf(values[i++], NCHARS, "%d", HeapTupleHeaderGetXmax(tuple->t_data));
-                       if (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)
+                       values[Atnum_xmax] = palloc(NCHARS * sizeof(char));
+                       snprintf(values[Atnum_xmax], NCHARS, "%d", xmax);
+                       if (infomask & HEAP_XMAX_IS_MULTI)
                        {
-                               TransactionId *xids;
-                               int                     nxids;
-                               int                     j;
-                               int                     isValidXid = 0;         /* any valid xid ever exists? */
-
-                               values[i++] = pstrdup("true");
-                               nxids = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &xids);
-                               if (nxids == -1)
+                               MultiXactMember *members;
+                               int                     nmembers;
+                               bool            first = true;
+                               bool            allow_old;
+
+                               values[Atnum_ismulti] = pstrdup("true");
+
+                               allow_old = !(infomask & HEAP_LOCK_MASK) &&
+                                                        (infomask & HEAP_XMAX_LOCK_ONLY);
+                               nmembers = GetMultiXactIdMembers(xmax, &members, allow_old);
+                               if (nmembers == -1)
                                {
-                                       elog(ERROR, "GetMultiXactIdMembers returns error");
+                                       values[Atnum_xids] = "{0}";
+                                       values[Atnum_modes] = "{transient upgrade status}";
+                                       values[Atnum_pids] = "{0}";
                                }
+                               else
+                               {
+                                       int                     j;
 
-                               values[i] = palloc(NCHARS * nxids);
-                               values[i + 1] = palloc(NCHARS * nxids);
-                               strcpy(values[i], "{");
-                               strcpy(values[i + 1], "{");
+                                       values[Atnum_xids] = palloc(NCHARS * nmembers);
+                                       values[Atnum_modes] = palloc(NCHARS * nmembers);
+                                       values[Atnum_pids] = palloc(NCHARS * nmembers);
 
-                               for (j = 0; j < nxids; j++)
-                               {
-                                       char            buf[NCHARS];
+                                       strcpy(values[Atnum_xids], "{");
+                                       strcpy(values[Atnum_modes], "{");
+                                       strcpy(values[Atnum_pids], "{");
 
-                                       if (TransactionIdIsInProgress(xids[j]))
+                                       for (j = 0; j < nmembers; j++)
                                        {
-                                               if (isValidXid)
+                                               char            buf[NCHARS];
+
+                                               if (!first)
                                                {
-                                                       strcat(values[i], ",");
-                                                       strcat(values[i + 1], ",");
+                                                       strcat(values[Atnum_xids], ",");
+                                                       strcat(values[Atnum_modes], ",");
+                                                       strcat(values[Atnum_pids], ",");
                                                }
-                                               snprintf(buf, NCHARS, "%d", xids[j]);
-                                               strcat(values[i], buf);
-                                               snprintf(buf, NCHARS, "%d", BackendXidGetPid(xids[j]));
-                                               strcat(values[i + 1], buf);
+                                               snprintf(buf, NCHARS, "%d", members[j].xid);
+                                               strcat(values[Atnum_xids], buf);
+                                               switch (members[j].status)
+                                               {
+                                                       case MultiXactStatusUpdate:
+                                                               snprintf(buf, NCHARS, "Update");
+                                                               break;
+                                                       case MultiXactStatusNoKeyUpdate:
+                                                               snprintf(buf, NCHARS, "No Key Update");
+                                                               break;
+                                                       case MultiXactStatusForUpdate:
+                                                               snprintf(buf, NCHARS, "For Update");
+                                                               break;
+                                                       case MultiXactStatusForNoKeyUpdate:
+                                                               snprintf(buf, NCHARS, "For No Key Update");
+                                                               break;
+                                                       case MultiXactStatusForShare:
+                                                               snprintf(buf, NCHARS, "Share");
+                                                               break;
+                                                       case MultiXactStatusForKeyShare:
+                                                               snprintf(buf, NCHARS, "Key Share");
+                                                               break;
+                                               }
+                                               strcat(values[Atnum_modes], buf);
+                                               snprintf(buf, NCHARS, "%d",
+                                                                BackendXidGetPid(members[j].xid));
+                                               strcat(values[Atnum_pids], buf);
 
-                                               isValidXid = 1;
+                                               first = false;
                                        }
-                               }
 
-                               strcat(values[i], "}");
-                               strcat(values[i + 1], "}");
-                               i++;
+                                       strcat(values[Atnum_xids], "}");
+                                       strcat(values[Atnum_modes], "}");
+                                       strcat(values[Atnum_pids], "}");
+                               }
                        }
                        else
                        {
-                               values[i++] = pstrdup("false");
-                               values[i] = palloc(NCHARS * sizeof(char));
-                               snprintf(values[i++], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data));
+                               values[Atnum_ismulti] = pstrdup("false");
+
+                               values[Atnum_xids] = palloc(NCHARS * sizeof(char));
+                               snprintf(values[Atnum_xids], NCHARS, "{%d}", xmax);
+
+                               values[Atnum_modes] = palloc(NCHARS);
+                               if (infomask & HEAP_XMAX_LOCK_ONLY)
+                               {
+                                       if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
+                                               snprintf(values[Atnum_modes], NCHARS, "{For Share}");
+                                       else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+                                               snprintf(values[Atnum_modes], NCHARS, "{For Key Share}");
+                                       else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
+                                               snprintf(values[Atnum_modes], NCHARS, "{For Update}");
+                                       else
+                                               /* neither keyshare nor exclusive bit is set */
+                                               snprintf(values[Atnum_modes], NCHARS,
+                                                                "{transient upgrade status}");
+                               }
+                               else
+                               {
+                                       if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
+                                               snprintf(values[Atnum_modes], NCHARS, "{Key Update}");
+                                       else
+                                               snprintf(values[Atnum_modes], NCHARS, "{Update}");
+                               }
 
-                               values[i] = palloc(NCHARS * sizeof(char));
-                               snprintf(values[i++], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data)));
+                               values[Atnum_pids] = palloc(NCHARS * sizeof(char));
+                               snprintf(values[Atnum_pids], NCHARS, "{%d}",
+                                                BackendXidGetPid(xmax));
                        }
 
                        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
@@ -200,10 +274,10 @@ pgrowlocks(PG_FUNCTION_ARGS)
                        /* make the tuple into a datum */
                        result = HeapTupleGetDatum(tuple);
 
-                       /* Clean up */
-                       for (i = 0; i < mydata->ncolumns; i++)
-                               pfree(values[i]);
-                       pfree(values);
+                       /*
+                        * no need to pfree what we allocated; it's on a short-lived memory
+                        * context anyway
+                        */
 
                        SRF_RETURN_NEXT(funcctx, result);
                }
index a6ba16451573e12fc67915bb4a23aa68e0179884..dfa587d76180d829e308111489b0097b58e96e85 100644 (file)
@@ -1,5 +1,5 @@
 # pgrowlocks extension
 comment = 'show row-level locking information'
-default_version = '1.0'
+default_version = '1.1'
 module_pathname = '$libdir/pgrowlocks'
 relocatable = true
index 390fa236d31937cc85a3d275ef0f43d4c591470d..c7714d88774f8eeafc82aec761b2de08c3829362 100644 (file)
@@ -43,12 +43,6 @@ pgrowlocks(text) returns setof record
       <entry><type>tid</type></entry>
       <entry>Tuple ID (TID) of locked row</entry>
      </row>
-     <row>
-      <entry><structfield>lock_type</structfield></entry>
-      <entry><type>text</type></entry>
-      <entry><literal>Shared</> for shared lock, or
-             <literal>Exclusive</> for exclusive lock</entry>
-     </row>
      <row>
       <entry><structfield>locker</structfield></entry>
       <entry><type>xid</type></entry>
@@ -64,6 +58,15 @@ pgrowlocks(text) returns setof record
       <entry><type>xid[]</type></entry>
       <entry>Transaction IDs of lockers (more than one if multitransaction)</entry>
      </row>
+     <row>
+      <entry><structfield>lock_type</structfield></entry>
+      <entry><type>text[]</type></entry>
+      <entry>Lock mode of lockers (more than one if multitransaction),
+       an array of <literal>Key Share</>, <literal>Share</>,
+       <literal>For No Key Update</>, <literal>No Key Update</>,
+       <literal>For Update</>, <literal>Update</>.</entry>
+     </row>
+
      <row>
       <entry><structfield>pids</structfield></entry>
       <entry><type>integer[]</type></entry>
index 9963780c3139ccf4c3b763d85b952325833fafe9..26d511fad8c5b8d02bda618006ce2606036db7c7 100644 (file)
@@ -45,7 +45,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="parameter">expression</replac
     [ LIMIT { <replaceable class="parameter">count</replaceable> | ALL } ]
     [ OFFSET <replaceable class="parameter">start</replaceable> [ ROW | ROWS ] ]
     [ FETCH { FIRST | NEXT } [ <replaceable class="parameter">count</replaceable> ] { ROW | ROWS } ONLY ]
-    [ FOR { UPDATE | SHARE } [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ] [ NOWAIT ] [...] ]
+    [ FOR { UPDATE | NO KEY UPDATE | SHARE | KEY SHARE } [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ] [ NOWAIT ] [...] ]
 
 <phrase>where <replaceable class="parameter">from_item</replaceable> can be one of:</phrase>
 
@@ -178,7 +178,8 @@ TABLE [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ]
 
     <listitem>
      <para>
-      If <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal>
+      If <literal>FOR UPDATE</>, <literal>FOR NO KEY UPDATE</literal>, <literal>FOR SHARE</literal>
+      or <literal>FOR KEY SHARE</literal>
       is specified, the
       <command>SELECT</command> statement locks the selected rows
       against concurrent updates.  (See <xref linkend="sql-for-update-share"
@@ -190,8 +191,9 @@ TABLE [ ONLY ] <replaceable class="parameter">table_name</replaceable> [ * ]
 
   <para>
    You must have <literal>SELECT</literal> privilege on each column used
-   in a <command>SELECT</> command.  The use of <literal>FOR UPDATE</literal>
-   or <literal>FOR SHARE</literal> requires
+   in a <command>SELECT</> command.  The use of <literal>FOR NO KEY UPDATE</>,
+   <literal>FOR UPDATE</literal>,
+   <literal>FOR SHARE</literal> or <literal>FOR KEY SHARE</literal> requires
    <literal>UPDATE</literal> privilege as well (for at least one column
    of each table so selected).
   </para>
@@ -873,8 +875,8 @@ SELECT DISTINCT ON (location) location, time, report
 <replaceable class="parameter">select_statement</replaceable> UNION [ ALL | DISTINCT ] <replaceable class="parameter">select_statement</replaceable>
 </synopsis><replaceable class="parameter">select_statement</replaceable> is
     any <command>SELECT</command> statement without an <literal>ORDER
-    BY</>, <literal>LIMIT</>, <literal>FOR UPDATE</literal>, or
-    <literal>FOR SHARE</literal> clause.
+    BY</>, <literal>LIMIT</>, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</literal>,
+    <literal>FOR SHARE</literal>, or <literal>FOR KEY SHARE</literal> clause.
     (<literal>ORDER BY</> and <literal>LIMIT</> can be attached to a
     subexpression if it is enclosed in parentheses.  Without
     parentheses, these clauses will be taken to apply to the result of
@@ -910,7 +912,8 @@ SELECT DISTINCT ON (location) location, time, report
    </para>
 
    <para>
-    Currently, <literal>FOR UPDATE</> and <literal>FOR SHARE</> cannot be
+    Currently, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</>, <literal>FOR SHARE</> and
+    <literal>FOR KEY SHARE</> cannot be
     specified either for a <literal>UNION</> result or for any input of a
     <literal>UNION</>.
    </para>
@@ -925,8 +928,8 @@ SELECT DISTINCT ON (location) location, time, report
 <replaceable class="parameter">select_statement</replaceable> INTERSECT [ ALL | DISTINCT ] <replaceable class="parameter">select_statement</replaceable>
 </synopsis><replaceable class="parameter">select_statement</replaceable> is
     any <command>SELECT</command> statement without an <literal>ORDER
-    BY</>, <literal>LIMIT</>, <literal>FOR UPDATE</literal>, or
-    <literal>FOR SHARE</literal> clause.
+    BY</>, <literal>LIMIT</>, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</literal>,
+    <literal>FOR SHARE</literal>, or <literal>FOR KEY SHARE</> clause.
    </para>
 
    <para>
@@ -957,7 +960,8 @@ SELECT DISTINCT ON (location) location, time, report
    </para>
 
    <para>
-    Currently, <literal>FOR UPDATE</> and <literal>FOR SHARE</> cannot be
+    Currently, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</>, <literal>FOR SHARE</> and
+    <literal>FOR KEY SHARE</> cannot be
     specified either for an <literal>INTERSECT</> result or for any input of
     an <literal>INTERSECT</>.
    </para>
@@ -972,8 +976,8 @@ SELECT DISTINCT ON (location) location, time, report
 <replaceable class="parameter">select_statement</replaceable> EXCEPT [ ALL | DISTINCT ] <replaceable class="parameter">select_statement</replaceable>
 </synopsis><replaceable class="parameter">select_statement</replaceable> is
     any <command>SELECT</command> statement without an <literal>ORDER
-    BY</>, <literal>LIMIT</>, <literal>FOR UPDATE</literal>, or
-    <literal>FOR SHARE</literal> clause.
+    BY</>, <literal>LIMIT</>, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</literal>,
+    <literal>FOR SHARE</literal>, or <literal>FOR KEY SHARE</> clause.
    </para>
 
    <para>
@@ -1000,7 +1004,8 @@ SELECT DISTINCT ON (location) location, time, report
    </para>
 
    <para>
-    Currently, <literal>FOR UPDATE</> and <literal>FOR SHARE</> cannot be
+    Currently, <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</>, <literal>FOR SHARE</> and
+    <literal>FOR KEY SHARE</> cannot be
     specified either for an <literal>EXCEPT</> result or for any input of
     an <literal>EXCEPT</>.
    </para>
@@ -1185,7 +1190,14 @@ FETCH { FIRST | NEXT } [ <replaceable class="parameter">count</replaceable> ] {
   </refsect2>
 
   <refsect2 id="SQL-FOR-UPDATE-SHARE">
-   <title id="sql-for-update-share-title"><literal>FOR UPDATE</literal>/<literal>FOR SHARE</literal> Clause</title>
+   <title id="sql-for-update-share-title"><literal>FOR UPDATE</>/<literal>FOR NO KEY UPDATE</>/<literal>FOR SHARE</>/<literal>FOR KEY SHARE</> Clauses</title>
+
+   <para>
+    <literal>FOR UPDATE</>, <literal>FOR NO KEY UPDATE</>, <literal>FOR SHARE</>
+    and <literal>FOR KEY SHARE</>
+    are <firstterm>locking clauses</>; they affect how <literal>SELECT</>
+    locks rows as they are obtained from the table.
+   </para>
 
    <para>
     The <literal>FOR UPDATE</literal> clause has this form:
@@ -1194,6 +1206,13 @@ FOR UPDATE [ OF <replaceable class="parameter">table_name</replaceable> [, ...]
 </synopsis>
    </para>
 
+   <para>
+    The <literal>FOR NO KEY UPDATE</literal> clause has this form:
+<synopsis>
+FOR NO KEY UPDATE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ] [ NOWAIT ]
+</synopsis>
+   </para>
+
    <para>
     The closely related <literal>FOR SHARE</literal> clause has this form:
 <synopsis>
@@ -1201,14 +1220,31 @@ FOR SHARE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ]
 </synopsis>
    </para>
 
+   <para>
+    Similarly, the <literal>FOR KEY SHARE</> clause has this form:
+<synopsis>
+FOR KEY SHARE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ] [ NOWAIT ]
+</synopsis>
+   </para>
+
    <para>
     <literal>FOR UPDATE</literal> causes the rows retrieved by the
     <command>SELECT</command> statement to be locked as though for
     update.  This prevents them from being modified or deleted by
     other transactions until the current transaction ends.  That is,
     other transactions that attempt <command>UPDATE</command>,
-    <command>DELETE</command>, or <command>SELECT FOR UPDATE</command>
+    <command>DELETE</command>,
+    <command>SELECT FOR UPDATE</command>,
+    <command>SELECT FOR SHARE</command> or
+    <command>SELECT FOR KEY SHARE</command>
     of these rows will be blocked until the current transaction ends.
+    The <literal>FOR UPDATE</> lock mode
+    is also acquired by any <command>DELETE</> on a row, and also by an
+    <command>UPDATE</> that modifies the values on certain columns.  Currently,
+    the set of columns considered for the <command>UPDATE</> case are those that
+    have a unique index on them that can be used in a foreign key (so partial
+    indexes and expressional indexes are not considered), but this may change
+    in the future.
     Also, if an <command>UPDATE</command>, <command>DELETE</command>,
     or <command>SELECT FOR UPDATE</command> from another transaction
     has already locked a selected row or rows, <command>SELECT FOR
@@ -1220,13 +1256,33 @@ FOR SHARE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ]
     linkend="mvcc">.
    </para>
 
+   <para>
+    <literal>FOR NO KEY UPDATE</> behaves similarly, except that the lock
+    acquired is weaker: this lock will not block
+    <literal>SELECT FOR KEY SHARE</> commands that attempt to acquire
+    a lock on the same rows.
+   </para>
+
    <para>
     <literal>FOR SHARE</literal> behaves similarly, except that it
     acquires a shared rather than exclusive lock on each retrieved
     row.  A shared lock blocks other transactions from performing
     <command>UPDATE</command>, <command>DELETE</command>, or <command>SELECT
     FOR UPDATE</command> on these rows, but it does not prevent them
-    from performing <command>SELECT FOR SHARE</command>.
+    from performing <command>SELECT FOR SHARE</command> or
+    <command>SELECT FOR KEY SHARE</command>.
+   </para>
+
+   <para>
+    <literal>FOR KEY SHARE</> behaves similarly to <literal>FOR SHARE</literal>,
+    except that the lock
+    is weaker: <literal>SELECT FOR UPDATE</> is blocked, but
+    not <literal>SELECT FOR NO KEY UPDATE</>.  A key-shared
+    lock blocks other transactions from performing <command>DELETE</command>
+    or any <command>UPDATE</command> that changes the key values, but not
+    other <command>UPDATE</>, and neither does it prevent
+    <command>SELECT FOR NO KEY UPDATE</>, <command>SELECT FOR SHARE</>, or
+    <command>SELECT FOR KEY SHARE</>.
    </para>
 
    <para>
@@ -1243,41 +1299,39 @@ FOR SHARE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ]
    </para>
 
    <para>
-    If specific tables are named in <literal>FOR UPDATE</literal>
-    or <literal>FOR SHARE</literal>,
+    If specific tables are named in a locking clause,
     then only rows coming from those tables are locked; any other
     tables used in the <command>SELECT</command> are simply read as
-    usual.  A <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal>
+    usual.  A locking
     clause without a table list affects all tables used in the statement.
-    If <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal> is
+    If a locking clause is
     applied to a view or sub-query, it affects all tables used in
     the view or sub-query.
-    However, <literal>FOR UPDATE</literal>/<literal>FOR SHARE</literal>
+    However, these clauses
     do not apply to <literal>WITH</> queries referenced by the primary query.
     If you want row locking to occur within a <literal>WITH</> query, specify
-    <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal> within the
-    <literal>WITH</> query.
+    a locking clause within the <literal>WITH</> query.
    </para>
 
    <para>
-    Multiple <literal>FOR UPDATE</literal> and <literal>FOR SHARE</literal>
+    Multiple locking
     clauses can be written if it is necessary to specify different locking
     behavior for different tables.  If the same table is mentioned (or
-    implicitly affected) by both <literal>FOR UPDATE</literal> and
-    <literal>FOR SHARE</literal> clauses, then it is processed as
-    <literal>FOR UPDATE</literal>.  Similarly, a table is processed
+    implicitly affected) by more than one locking clause,
+    then it is processed as if it was only specified by the strongest one.
+    Similarly, a table is processed
     as <literal>NOWAIT</> if that is specified in any of the clauses
     affecting it.
    </para>
 
    <para>
-    <literal>FOR UPDATE</literal> and <literal>FOR SHARE</literal> cannot be
+    The locking clauses cannot be
     used in contexts where returned rows cannot be clearly identified with
     individual table rows; for example they cannot be used with aggregation.
    </para>
 
    <para>
-    When <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal>
+    When a locking clause
     appears at the top level of a <command>SELECT</> query, the rows that
     are locked are exactly those that are returned by the query; in the
     case of a join query, the rows locked are those that contribute to
@@ -1288,13 +1342,13 @@ FOR SHARE [ OF <replaceable class="parameter">table_name</replaceable> [, ...] ]
     <literal>LIMIT</> is used, locking stops
     once enough rows have been returned to satisfy the limit (but note that
     rows skipped over by <literal>OFFSET</> will get locked).  Similarly,
-    if <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal>
+    if a locking clause
     is used in a cursor's query, only rows actually fetched or stepped past
     by the cursor will be locked.
    </para>
 
    <para>
-    When <literal>FOR UPDATE</literal> or <literal>FOR SHARE</literal>
+    When a locking clause
     appears in a sub-<command>SELECT</>, the rows locked are those
     returned to the outer query by the sub-query.  This might involve
     fewer rows than inspection of the sub-query alone would suggest,
@@ -1307,11 +1361,9 @@ SELECT * FROM (SELECT * FROM mytable FOR UPDATE) ss WHERE col1 = 5;
     condition is not textually within the sub-query.
    </para>
 
-  <caution>
-   <para>
-    Avoid locking a row and then modifying it within a later savepoint or
-    <application>PL/pgSQL</application> exception block.  A subsequent
-    rollback would cause the lock to be lost.  For example:
+  <para>
+   Previous releases failed to preserve a lock which is upgraded by a later
+   savepoint.  For example, this code:
 <programlisting>
 BEGIN;
 SELECT * FROM mytable WHERE key = 1 FOR UPDATE;
@@ -1319,23 +1371,15 @@ SAVEPOINT s;
 UPDATE mytable SET ... WHERE key = 1;
 ROLLBACK TO s;
 </programlisting>
-    After the <command>ROLLBACK</>, the row is effectively unlocked, rather
-    than returned to its pre-savepoint state of being locked but not modified.
-    This hazard occurs if a row locked in the current transaction is updated
-    or deleted, or if a shared lock is upgraded to exclusive: in all these
-    cases, the former lock state is forgotten.  If the transaction is then
-    rolled back to a state between the original locking command and the
-    subsequent change, the row will appear not to be locked at all.  This is
-    an implementation deficiency which will be addressed in a future release
-    of <productname>PostgreSQL</productname>.
-   </para>
-  </caution>
+   would fail to preserve the <literal>FOR UPDATE</> lock after the
+   <command>ROLLBACK</>.  This has been fixed in release 9.3.
+  </para>
 
   <caution>
    <para>
     It is possible for a <command>SELECT</> command running at the <literal>READ
     COMMITTED</literal> transaction isolation level and using <literal>ORDER
-    BY</literal> and <literal>FOR UPDATE/SHARE</literal> to return rows out of
+    BY</literal> and a locking clause to return rows out of
     order.  This is because <literal>ORDER BY</> is applied first.
     The command sorts the result, but might then block trying to obtain a lock
     on one or more of the rows.  Once the <literal>SELECT</> unblocks, some
@@ -1765,14 +1809,16 @@ SELECT distributors.* WHERE distributors.name = 'Westward';
   </refsect2>
 
   <refsect2>
-   <title><literal>FOR UPDATE</> and <literal>FOR SHARE</></title>
+   <title><literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</>, <literal>FOR SHARE</>, <literal>FOR KEY SHARE</></title>
 
    <para>
     Although <literal>FOR UPDATE</> appears in the SQL standard, the
     standard allows it only as an option of <command>DECLARE CURSOR</>.
     <productname>PostgreSQL</productname> allows it in any <command>SELECT</>
     query as well as in sub-<command>SELECT</>s, but this is an extension.
-    The <literal>FOR SHARE</> variant, and the <literal>NOWAIT</> option,
+    The <literal>FOR NO KEY UPDATE</>, <literal>FOR SHARE</> and
+    <literal>FOR KEY SHARE</> variants,
+    as well as the <literal>NOWAIT</> option,
     do not appear in the standard.
    </para>
   </refsect2>
index 0706e3afc2df5e7cf1e2eca73dd9602d084f9795..e39b9770cbf519cd65630f0332e92c004d4d4f7d 100644 (file)
@@ -542,7 +542,7 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
                        result = TransactionIdGetDatum(HeapTupleHeaderGetXmin(tup->t_data));
                        break;
                case MaxTransactionIdAttributeNumber:
-                       result = TransactionIdGetDatum(HeapTupleHeaderGetXmax(tup->t_data));
+                       result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data));
                        break;
                case MinCommandIdAttributeNumber:
                case MaxCommandIdAttributeNumber:
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock
new file mode 100644 (file)
index 0000000..8d5cc16
--- /dev/null
@@ -0,0 +1,139 @@
+Locking tuples
+--------------
+
+Locking tuples is not as easy as locking tables or other database objects.
+The problem is that transactions might want to lock large numbers of tuples at
+any one time, so it's not possible to keep the lock objects in shared memory.
+To work around this limitation, we use a two-level mechanism.  The first level
+is implemented by storing locking information in the tuple header: a tuple is
+marked as locked by setting the current transaction's XID as its XMAX, and
+setting additional infomask bits to distinguish this case from the more normal
+case of having deleted the tuple.  When multiple transactions concurrently
+lock a tuple, a MultiXact is used; see below.  This mechanism can accommodate
+arbitrarily large numbers of tuples being locked simultaneously.
+
+When it is necessary to wait for a tuple-level lock to be released, the basic
+delay is provided by XactLockTableWait or MultiXactIdWait on the contents of
+the tuple's XMAX.  However, that mechanism will release all waiters
+concurrently, so there would be a race condition as to which waiter gets the
+tuple, potentially leading to indefinite starvation of some waiters.  The
+possibility of share-locking makes the problem much worse --- a steady stream
+of share-lockers can easily block an exclusive locker forever.  To provide
+more reliable semantics about who gets a tuple-level lock first, we use the
+standard lock manager, which implements the second level mentioned above.  The
+protocol for waiting for a tuple-level lock is really
+
+     LockTuple()
+     XactLockTableWait()
+     mark tuple as locked by me
+     UnlockTuple()
+
+When there are multiple waiters, arbitration of who is to get the lock next
+is provided by LockTuple().  However, at most one tuple-level lock will
+be held or awaited per backend at any time, so we don't risk overflow
+of the lock table.  Note that incoming share-lockers are required to
+do LockTuple as well, if there is any conflict, to ensure that they don't
+starve out waiting exclusive-lockers.  However, if there is not any active
+conflict for a tuple, we don't incur any extra overhead.
+
+We provide four levels of tuple locking strength: SELECT FOR KEY UPDATE is
+super-exclusive locking (used to delete tuples and more generally to update
+tuples modifying the values of the columns that make up the key of the tuple);
+SELECT FOR UPDATE is a standards-compliant exclusive lock; SELECT FOR SHARE
+implements shared locks; and finally SELECT FOR KEY SHARE is a super-weak mode
+that does not conflict with exclusive mode, but conflicts with SELECT FOR KEY
+UPDATE.  This last mode implements a mode just strong enough to implement RI
+checks, i.e. it ensures that tuples do not go away from under a check, without
+blocking other transactions that want to update the tuple without changing
+its key.
+
+The conflict table is:
+
+                KEY UPDATE        UPDATE        SHARE        KEY SHARE
+KEY UPDATE       conflict        conflict      conflict      conflict
+UPDATE           conflict        conflict      conflict
+SHARE            conflict        conflict
+KEY SHARE        conflict
+
+When there is a single locker in a tuple, we can just store the locking info
+in the tuple itself.  We do this by storing the locker's Xid in XMAX, and
+setting infomask bits specifying the locking strength.  There is one exception
+here: since infomask space is limited, we do not provide a separate bit
+for SELECT FOR SHARE, so we have to use the extended info in a MultiXact in
+that case.  (The other cases, SELECT FOR UPDATE and SELECT FOR KEY SHARE, are
+presumably more commonly used due to being the standards-mandated locking
+mechanism, or heavily used by the RI code, so we want to provide fast paths
+for those.)
+
+MultiXacts
+----------
+
+A tuple header provides very limited space for storing information about tuple
+locking and updates: there is room only for a single Xid and a small number of
+infomask bits.  Whenever we need to store more than one lock, we replace the
+first locker's Xid with a new MultiXactId.  Each MultiXact provides extended
+locking data; it comprises an array of Xids plus some flag bits for each one.
+The flags are currently used to store the locking strength of each member
+transaction.  (The flags also distinguish a pure locker from an updater.)
+
+In earlier PostgreSQL releases, a MultiXact always meant that the tuple was
+locked in shared mode by multiple transactions.  This is no longer the case; a
+MultiXact may contain an update or delete Xid.  (Keep in mind that tuple locks
+in a transaction do not conflict with other tuple locks in the same
+transaction, so it's possible to have otherwise conflicting locks in a
+MultiXact if they belong to the same transaction).
+
+Note that each lock is attributed to the subtransaction that acquires it.
+This means that a subtransaction that aborts is seen as though it releases the
+locks it acquired; concurrent transactions can then proceed without having to
+wait for the main transaction to finish.  It also means that a subtransaction
+can upgrade to a stronger lock level than an earlier transaction had, and if
+the subxact aborts, the earlier, weaker lock is kept.
+
+The possibility of having an update within a MultiXact means that they must
+persist across crashes and restarts: a future reader of the tuple needs to
+figure out whether the update committed or aborted.  So we have a requirement
+that pg_multixact needs to retain pages of its data until we're certain that
+the MultiXacts in them are no longer of interest.
+
+VACUUM is in charge of removing old MultiXacts at the time of tuple freezing.
+This works in the same way that pg_clog segments are removed: we have a
+pg_class column that stores the earliest multixact that could possibly be
+stored in the table; the minimum of all such values is stored in a pg_database
+column.  VACUUM computes the minimum across all pg_database values, and
+removes pg_multixact segments older than the minimum.
+
+Infomask Bits
+-------------
+
+The following infomask bits are applicable:
+
+- HEAP_XMAX_INVALID
+  Any tuple with this bit set does not have a valid value stored in XMAX.
+
+- HEAP_XMAX_IS_MULTI
+  This bit is set if the tuple's Xmax is a MultiXactId (as opposed to a
+  regular TransactionId).
+
+- HEAP_XMAX_LOCK_ONLY
+  This bit is set when the XMAX is a locker only; that is, if it's a
+  multixact, it does not contain an update among its members.  It's set when
+  the XMAX is a plain Xid that locked the tuple, as well.
+
+- HEAP_XMAX_KEYSHR_LOCK
+- HEAP_XMAX_EXCL_LOCK
+  These bits indicate the strength of the lock acquired; they are useful when
+  the XMAX is not a MultiXactId.  If it's a multi, the info is to be found in
+  the member flags.  If HEAP_XMAX_IS_MULTI is not set and HEAP_XMAX_LOCK_ONLY
+  is set, then one of these *must* be set as well.
+  Note there is no infomask bit for a SELECT FOR SHARE lock.  Also there is no
+  separate bit for a SELECT FOR KEY UPDATE lock; this is implemented by the
+  HEAP_KEYS_UPDATED bit.
+
+- HEAP_KEYS_UPDATED
+  This bit lives in t_infomask2.  If set, indicates that the XMAX updated
+  this tuple and changed the key values, or it deleted the tuple.
+  It's set regardless of whether the XMAX is a TransactionId or a MultiXactId.
+
+We currently never set the HEAP_XMAX_COMMITTED bit when the
+HEAP_XMAX_IS_MULTI bit is set.
index b19d1cf6c5746f3d64fe0bc24ef5222afdf154ff..57d47e8601443d592982e6faffb7855391790b95 100644 (file)
@@ -84,12 +84,105 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
                                        TransactionId xid, CommandId cid, int options);
 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
-                               ItemPointerData from, Buffer newbuf, HeapTuple newtup,
-                               bool all_visible_cleared, bool new_all_visible_cleared);
-static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
-                                          HeapTuple oldtup, HeapTuple newtup);
+                               Buffer newbuf, HeapTuple oldtup,
+                               HeapTuple newtup, bool all_visible_cleared,
+                               bool new_all_visible_cleared);
+static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
+                                                        Bitmapset *hot_attrs, Bitmapset *key_attrs,
+                                                        bool *satisfies_hot, bool *satisfies_key,
+                                                        HeapTuple oldtup, HeapTuple newtup);
+static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
+                                                 uint16 old_infomask2, TransactionId add_to_xmax,
+                                                 LockTupleMode mode, bool is_update,
+                                                 TransactionId *result_xmax, uint16 *result_infomask,
+                                                 uint16 *result_infomask2);
+static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
+                                               ItemPointer ctid, TransactionId xid,
+                                               LockTupleMode mode);
+static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
+                                          uint16 *new_infomask2);
+static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
+                                               uint16 t_infomask);
+static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+                               int *remaining, uint16 infomask);
+static bool ConditionalMultiXactIdWait(MultiXactId multi,
+                                                  MultiXactStatus status, int *remaining,
+                                                  uint16 infomask);
 
 
+/*
+ * Each tuple lock mode has a corresponding heavyweight lock, and one or two
+ * corresponding MultiXactStatuses (one to merely lock tuples, another one to
+ * update them).  This table (and the macros below) helps us determine the
+ * heavyweight lock mode and MultiXactStatus values to use for any particular
+ * tuple lock strength.
+ */
+static const struct
+{
+       LOCKMODE        hwlock;
+       MultiXactStatus lockstatus;
+       MultiXactStatus updstatus;
+}
+tupleLockExtraInfo[MaxLockTupleMode + 1] =
+{
+       {       /* LockTupleKeyShare */
+               AccessShareLock,
+               MultiXactStatusForKeyShare,
+               -1      /* KeyShare does not allow updating tuples */
+       },
+       {       /* LockTupleShare */
+               RowShareLock,
+               MultiXactStatusForShare,
+               -1      /* Share does not allow updating tuples */
+       },
+       {       /* LockTupleNoKeyExclusive */
+               ExclusiveLock,
+               MultiXactStatusForNoKeyUpdate,
+               MultiXactStatusNoKeyUpdate
+       },
+       {       /* LockTupleExclusive */
+               AccessExclusiveLock,
+               MultiXactStatusForUpdate,
+               MultiXactStatusUpdate
+       }
+};
+/* Get the LOCKMODE for a given MultiXactStatus */
+#define LOCKMODE_from_mxstatus(status) \
+                       (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
+
+/*
+ * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
+ * This is more readable than having every caller translate it to lock.h's
+ * LOCKMODE.
+ */
+#define LockTupleTuplock(rel, tup, mode) \
+       LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+#define UnlockTupleTuplock(rel, tup, mode) \
+       UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+#define ConditionalLockTupleTuplock(rel, tup, mode) \
+       ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+
+/*
+ * This table maps each MultiXactStatus value to the corresponding tuple
+ * lock strength value.
+ */
+static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
+{
+       LockTupleKeyShare,              /* ForKeyShare */
+       LockTupleShare,                 /* ForShare */
+       LockTupleNoKeyExclusive,                /* ForNoKeyUpdate */
+       LockTupleExclusive,             /* ForUpdate */
+       LockTupleNoKeyExclusive,                /* NoKeyUpdate */
+       LockTupleExclusive              /* Update */
+};
+
+/* Get the LockTupleMode for a given MultiXactStatus */
+#define TUPLOCK_from_mxstatus(status) \
+                       (MultiXactStatusLock[(status)])
+/* Get the is_update bit for a given MultiXactStatus */
+#define ISUPDATE_from_mxstatus(status) \
+                       ((status) > MultiXactStatusForUpdate)
+
 /* ----------------------------------------------------------------
  *                                              heap support routines
  * ----------------------------------------------------------------
@@ -1664,7 +1757,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
                                   ItemPointerGetBlockNumber(tid));
                        offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
                        at_chain_start = false;
-                       prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+                       prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
                }
                else
                        break;                          /* end of chain */
@@ -1787,7 +1880,7 @@ heap_get_latest_tid(Relation relation,
                 * tuple.  Check for XMIN match.
                 */
                if (TransactionIdIsValid(priorXmax) &&
-                 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
+                       !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
                {
                        UnlockReleaseBuffer(buffer);
                        break;
@@ -1805,7 +1898,8 @@ heap_get_latest_tid(Relation relation,
                /*
                 * If there's a valid t_ctid link, follow it, else we're done.
                 */
-               if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) ||
+               if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+                       HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
                        ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
                {
                        UnlockReleaseBuffer(buffer);
@@ -1813,7 +1907,7 @@ heap_get_latest_tid(Relation relation,
                }
 
                ctid = tp.t_data->t_ctid;
-               priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+               priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
                UnlockReleaseBuffer(buffer);
        }                                                       /* end of loop */
 }
@@ -1826,17 +1920,25 @@ heap_get_latest_tid(Relation relation,
  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
  * be set on exit.     If the transaction committed, we set the XMAX_COMMITTED
  * hint bit if possible --- but beware that that may not yet be possible,
- * if the transaction committed asynchronously.  Hence callers should look
- * only at XMAX_INVALID.
+ * if the transaction committed asynchronously.
+ *
+ * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
+ * even if it commits.
+ *
+ * Hence callers should look only at XMAX_INVALID.
+ *
+ * Note this is not allowed for tuples whose xmax is a multixact.
  */
 static void
 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 {
-       Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), xid));
+       Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
+       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
 
        if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
        {
-               if (TransactionIdDidCommit(xid))
+               if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
+                       TransactionIdDidCommit(xid))
                        HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
                                                                 xid);
                else
@@ -2373,6 +2475,26 @@ simple_heap_insert(Relation relation, HeapTuple tup)
        return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
 }
 
+/*
+ * Given infomask/infomask2, compute the bits that must be saved in the
+ * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
+ * xl_heap_lock_updated WAL records.
+ *
+ * See fix_infomask_from_infobits.
+ */
+static uint8
+compute_infobits(uint16 infomask, uint16 infomask2)
+{
+       return
+               ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
+               ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
+               ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
+               /* note we ignore HEAP_XMAX_SHR_LOCK here */
+               ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
+               ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
+                XLHL_KEYS_UPDATED : 0);
+}
+
 /*
  *     heap_delete - delete a tuple
  *
@@ -2393,7 +2515,8 @@ simple_heap_insert(Relation relation, HeapTuple tup)
  * (the last only possible if wait == false).
  *
  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
  * cannot obtain cmax from a combocid generated by another transaction).
  * See comments for struct HeapUpdateFailureData for additional info.
  */
@@ -2410,6 +2533,9 @@ heap_delete(Relation relation, ItemPointer tid,
        BlockNumber block;
        Buffer          buffer;
        Buffer          vmbuffer = InvalidBuffer;
+       TransactionId new_xmax;
+       uint16          new_infomask,
+                               new_infomask2;
        bool            have_tuple_lock = false;
        bool            iscombo;
        bool            all_visible_cleared = false;
@@ -2465,7 +2591,7 @@ l1:
                uint16          infomask;
 
                /* must copy state data before unlocking buffer */
-               xwait = HeapTupleHeaderGetXmax(tp.t_data);
+               xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
                infomask = tp.t_data->t_infomask;
 
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -2481,20 +2607,20 @@ l1:
                 */
                if (!have_tuple_lock)
                {
-                       LockTuple(relation, &(tp.t_self), ExclusiveLock);
+                       LockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
                        have_tuple_lock = true;
                }
 
                /*
                 * Sleep until concurrent transaction ends.  Note that we don't care
-                * if the locker has an exclusive or shared lock, because we need
-                * exclusive.
+                * which lock mode the locker has, because we need the strongest one.
                 */
 
                if (infomask & HEAP_XMAX_IS_MULTI)
                {
                        /* wait for multixact */
-                       MultiXactIdWait((MultiXactId) xwait);
+                       MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
+                                                       NULL, infomask);
                        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
                        /*
@@ -2503,7 +2629,7 @@ l1:
                         * change, and start over if so.
                         */
                        if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
+                               !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
                                                                         xwait))
                                goto l1;
 
@@ -2529,7 +2655,7 @@ l1:
                         * Check for xmax change, and start over if so.
                         */
                        if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
+                               !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
                                                                         xwait))
                                goto l1;
 
@@ -2541,8 +2667,9 @@ l1:
                 * We may overwrite if previous xmax aborted, or if it committed but
                 * only locked the tuple without updating it.
                 */
-               if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
-                                                                        HEAP_IS_LOCKED))
+               if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+                       HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
+                       HeapTupleHeaderIsOnlyLocked(tp.t_data))
                        result = HeapTupleMayBeUpdated;
                else
                        result = HeapTupleUpdated;
@@ -2562,14 +2689,14 @@ l1:
                           result == HeapTupleBeingUpdated);
                Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
                hufd->ctid = tp.t_data->t_ctid;
-               hufd->xmax = HeapTupleHeaderGetXmax(tp.t_data);
+               hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
                if (result == HeapTupleSelfUpdated)
                        hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
                else
                        hufd->cmax = 0;         /* for lack of an InvalidCommandId value */
                UnlockReleaseBuffer(buffer);
                if (have_tuple_lock)
-                       UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+                       UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
                if (vmbuffer != InvalidBuffer)
                        ReleaseBuffer(vmbuffer);
                return result;
@@ -2603,14 +2730,29 @@ l1:
                                                        vmbuffer);
        }
 
+       /*
+        * If this is the first possibly-multixact-able operation in the
+        * current transaction, set my per-backend OldestMemberMXactId setting.
+        * We can be certain that the transaction will never become a member of
+        * any older MultiXactIds than that.  (We have to do this even if we
+        * end up just using our own TransactionId below, since some other
+        * backend could incorporate our XID into a MultiXact immediately
+        * afterwards.)
+        */
+       MultiXactIdSetOldestMember();
+
+       compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
+                                                         tp.t_data->t_infomask, tp.t_data->t_infomask2,
+                                                         xid, LockTupleExclusive, true,
+                                                         &new_xmax, &new_infomask, &new_infomask2);
+
        /* store transaction information of xact deleting the tuple */
-       tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                          HEAP_XMAX_INVALID |
-                                                          HEAP_XMAX_IS_MULTI |
-                                                          HEAP_IS_LOCKED |
-                                                          HEAP_MOVED);
+       tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+       tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+       tp.t_data->t_infomask |= new_infomask;
+       tp.t_data->t_infomask2 |= new_infomask2;
        HeapTupleHeaderClearHotUpdated(tp.t_data);
-       HeapTupleHeaderSetXmax(tp.t_data, xid);
+       HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
        HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
        /* Make sure there is no forward chain link in t_ctid */
        tp.t_data->t_ctid = tp.t_self;
@@ -2625,8 +2767,11 @@ l1:
                XLogRecData rdata[2];
 
                xlrec.all_visible_cleared = all_visible_cleared;
+               xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
+                                                                                         tp.t_data->t_infomask2);
                xlrec.target.node = relation->rd_node;
                xlrec.target.tid = tp.t_self;
+               xlrec.xmax = new_xmax;
                rdata[0].data = (char *) &xlrec;
                rdata[0].len = SizeOfHeapDelete;
                rdata[0].buffer = InvalidBuffer;
@@ -2679,7 +2824,7 @@ l1:
         * Release the lmgr tuple lock, if we had it.
         */
        if (have_tuple_lock)
-               UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+               UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
 
        pgstat_count_heap_delete(relation);
 
@@ -2739,6 +2884,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  *     crosscheck - if not InvalidSnapshot, also check old tuple against this
  *     wait - true if should wait for any conflicting update to commit/abort
  *     hufd - output parameter, filled in failure cases (see below)
+ *     lockmode - output parameter, filled with lock mode acquired on tuple
  *
  * Normal, successful return value is HeapTupleMayBeUpdated, which
  * actually means we *did* update it.  Failure return codes are
@@ -2752,23 +2898,26 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  * data are not reflected into *newtup.
  *
  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
  * cannot obtain cmax from a combocid generated by another transaction).
  * See comments for struct HeapUpdateFailureData for additional info.
  */
 HTSU_Result
 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
                        CommandId cid, Snapshot crosscheck, bool wait,
-                       HeapUpdateFailureData *hufd)
+                       HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
 {
        HTSU_Result result;
        TransactionId xid = GetCurrentTransactionId();
        Bitmapset  *hot_attrs;
+       Bitmapset  *key_attrs;
        ItemId          lp;
        HeapTupleData oldtup;
        HeapTuple       heaptup;
        Page            page;
        BlockNumber block;
+       MultiXactStatus mxact_status;
        Buffer          buffer,
                                newbuf,
                                vmbuffer = InvalidBuffer,
@@ -2779,9 +2928,20 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
                                pagefree;
        bool            have_tuple_lock = false;
        bool            iscombo;
+       bool            satisfies_hot;
+       bool            satisfies_key;
        bool            use_hot_update = false;
+       bool            key_intact;
        bool            all_visible_cleared = false;
        bool            all_visible_cleared_new = false;
+       bool            checked_lockers;
+       bool            locker_remains;
+       TransactionId xmax_new_tuple,
+                                 xmax_old_tuple;
+       uint16          infomask_old_tuple,
+                               infomask2_old_tuple,
+                               infomask_new_tuple,
+                               infomask2_new_tuple;
 
        Assert(ItemPointerIsValid(otid));
 
@@ -2797,7 +2957,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
         * Note that we get a copy here, so we need not worry about relcache flush
         * happening midway through.
         */
-       hot_attrs = RelationGetIndexAttrBitmap(relation);
+       hot_attrs = RelationGetIndexAttrBitmap(relation, false);
+       key_attrs = RelationGetIndexAttrBitmap(relation, true);
 
        block = ItemPointerGetBlockNumber(otid);
        buffer = ReadBuffer(relation, block);
@@ -2821,6 +2982,44 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
        oldtup.t_len = ItemIdGetLength(lp);
        oldtup.t_self = *otid;
 
+       /*
+        * If we're not updating any "key" column, we can grab a weaker lock type.
+        * This allows for more concurrency when we are running simultaneously with
+        * foreign key checks.
+        *
+        * Note that if a column gets detoasted while executing the update, but the
+        * value ends up being the same, this test will fail and we will use the
+        * stronger lock.  This is acceptable; the important case to optimize is
+        * updates that don't manipulate key columns, not those that
+        * serendipitously arrive at the same key values.
+        */
+       HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs,
+                                                                &satisfies_hot, &satisfies_key,
+                                                                &oldtup, newtup);
+       if (satisfies_key)
+       {
+               *lockmode = LockTupleNoKeyExclusive;
+               mxact_status = MultiXactStatusNoKeyUpdate;
+               key_intact = true;
+
+               /*
+                * If this is the first possibly-multixact-able operation in the
+                * current transaction, set my per-backend OldestMemberMXactId setting.
+                * We can be certain that the transaction will never become a member of
+                * any older MultiXactIds than that.  (We have to do this even if we
+                * end up just using our own TransactionId below, since some other
+                * backend could incorporate our XID into a MultiXact immediately
+                * afterwards.)
+                */
+               MultiXactIdSetOldestMember();
+       }
+       else
+       {
+               *lockmode = LockTupleExclusive;
+               mxact_status = MultiXactStatusUpdate;
+               key_intact = false;
+       }
+
        /*
         * Note: beyond this point, use oldtup not otid to refer to old tuple.
         * otid may very well point at newtup->t_self, which we will overwrite
@@ -2829,8 +3028,13 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
         */
 
 l2:
+       checked_lockers = false;
+       locker_remains = false;
        result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);
 
+       /* see below about the "no wait" case */
+       Assert(result != HeapTupleBeingUpdated || wait);
+
        if (result == HeapTupleInvisible)
        {
                UnlockReleaseBuffer(buffer);
@@ -2838,11 +3042,26 @@ l2:
        }
        else if (result == HeapTupleBeingUpdated && wait)
        {
-               TransactionId xwait;
+               TransactionId   xwait;
                uint16          infomask;
+               bool            can_continue = false;
+
+               checked_lockers = true;
+
+               /*
+                * XXX note that we don't consider the "no wait" case here.  This
+                * isn't a problem currently because no caller uses that case, but it
+                * should be fixed if such a caller is introduced.  It wasn't a problem
+                * previously because this code would always wait, but now that some
+                * tuple locks do not conflict with one of the lock modes we use, it is
+                * possible that this case is interesting to handle specially.
+                *
+                * This may cause failures with third-party code that calls heap_update
+                * directly.
+                */
 
                /* must copy state data before unlocking buffer */
-               xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
+               xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
                infomask = oldtup.t_data->t_infomask;
 
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -2858,20 +3077,29 @@ l2:
                 */
                if (!have_tuple_lock)
                {
-                       LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+                       LockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
                        have_tuple_lock = true;
                }
 
                /*
-                * Sleep until concurrent transaction ends.  Note that we don't care
-                * if the locker has an exclusive or shared lock, because we need
-                * exclusive.
+                * Now we have to do something about the existing locker.  If it's a
+                * multi, sleep on it; we might be awakened before it is completely
+                * gone (or even not sleep at all in some cases); we need to preserve
+                * it as locker, unless it is gone completely.
+                *
+                * If it's not a multi, we need to check for sleeping conditions before
+                * actually going to sleep.  If the update doesn't conflict with the
+                * locks, we just continue without sleeping (but making sure it is
+                * preserved).
                 */
-
                if (infomask & HEAP_XMAX_IS_MULTI)
                {
+                       TransactionId   update_xact;
+                       int                             remain;
+
                        /* wait for multixact */
-                       MultiXactIdWait((MultiXactId) xwait);
+                       MultiXactIdWait((MultiXactId) xwait, mxact_status, &remain,
+                                                       infomask);
                        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
                        /*
@@ -2880,49 +3108,87 @@ l2:
                         * change, and start over if so.
                         */
                        if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
+                               !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
                                                                         xwait))
                                goto l2;
 
                        /*
-                        * You might think the multixact is necessarily done here, but not
-                        * so: it could have surviving members, namely our own xact or
-                        * other subxacts of this backend.      It is legal for us to update
-                        * the tuple in either case, however (the latter case is
-                        * essentially a situation of upgrading our former shared lock to
-                        * exclusive).  We don't bother changing the on-disk hint bits
-                        * since we are about to overwrite the xmax altogether.
+                        * Note that the multixact may not be done by now.  It could have
+                        * surviving members: our own xact or other subxacts of this
+                        * backend, and also any other concurrent transaction that locked
+                        * the tuple with KeyShare, if we only took LockTupleNoKeyExclusive.
+                        * If this is the case, we have to be careful to mark the updated
+                        * tuple with the surviving members in Xmax.
+                        *
+                        * Note that there could have been another update in the MultiXact.
+                        * In that case, we need to check whether it committed or aborted.
+                        * If it aborted we are safe to update it again; otherwise there is
+                        * an update conflict, and we have to return HeapTupleUpdated
+                        * below.
+                        *
+                        * In the LockTupleExclusive case, we still need to preserve the
+                        * surviving members: those would include the tuple locks we had
+                        * before this one, which are important to keep in case this
+                        * subxact aborts.
                         */
+                       update_xact = InvalidTransactionId;
+                       if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
+                               update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
+
+                       /* there was no UPDATE in the MultiXact; or it aborted. */
+                       if (!TransactionIdIsValid(update_xact) ||
+                               TransactionIdDidAbort(update_xact))
+                               can_continue = true;
+
+                       locker_remains = remain != 0;
                }
                else
                {
-                       /* wait for regular transaction to end */
-                       XactLockTableWait(xwait);
-                       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
                        /*
-                        * xwait is done, but if xwait had just locked the tuple then some
-                        * other xact could update this tuple before we get to this point.
-                        * Check for xmax change, and start over if so.
+                        * If it's just a key-share locker, and we're not changing the
+                        * key columns, we don't need to wait for it to end; but we
+                        * need to preserve it as locker.
                         */
-                       if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
-                                                                        xwait))
-                               goto l2;
+                       if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
+                       {
+                               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
-                       /* Otherwise check if it committed or aborted */
-                       UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+                               /*
+                                * recheck the locker; if someone else changed the tuple while we
+                                * weren't looking, start over.
+                                */
+                               if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+                                                                                xwait))
+                                       goto l2;
+
+                               can_continue = true;
+                               locker_remains = true;
+                       }
+                       else
+                       {
+                               /* wait for regular transaction to end */
+                               XactLockTableWait(xwait);
+                               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+                               /*
+                                * xwait is done, but if xwait had just locked the tuple then some
+                                * other xact could update this tuple before we get to this point.
+                                * Check for xmax change, and start over if so.
+                                */
+                               if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+                                                                                xwait))
+                                       goto l2;
+
+                               /* Otherwise check if it committed or aborted */
+                               UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+                               if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
+                                       can_continue = true;
+                       }
                }
 
-               /*
-                * We may overwrite if previous xmax aborted, or if it committed but
-                * only locked the tuple without updating it.
-                */
-               if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
-                                                                                HEAP_IS_LOCKED))
-                       result = HeapTupleMayBeUpdated;
-               else
-                       result = HeapTupleUpdated;
+               result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
        }
 
        if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
@@ -2939,17 +3205,18 @@ l2:
                           result == HeapTupleBeingUpdated);
                Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
                hufd->ctid = oldtup.t_data->t_ctid;
-               hufd->xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
+               hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
                if (result == HeapTupleSelfUpdated)
                        hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
                else
                        hufd->cmax = 0;         /* for lack of an InvalidCommandId value */
                UnlockReleaseBuffer(buffer);
                if (have_tuple_lock)
-                       UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+                       UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
                if (vmbuffer != InvalidBuffer)
                        ReleaseBuffer(vmbuffer);
                bms_free(hot_attrs);
+               bms_free(key_attrs);
                return result;
        }
 
@@ -2958,7 +3225,7 @@ l2:
         * visible while we were busy locking the buffer, or during some
         * subsequent window during which we had it unlocked, we'll have to unlock
         * and re-lock, to avoid holding the buffer lock across an I/O.  That's a
-        * bit unfortunate, esepecially since we'll now have to recheck whether
+        * bit unfortunate, especially since we'll now have to recheck whether
         * the tuple has been locked or updated under us, but hopefully it won't
         * happen very often.
         */
@@ -2991,12 +3258,54 @@ l2:
                Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
        }
 
+       /*
+        * If the tuple we're updating is locked, we need to preserve the locking
+        * info in the old tuple's Xmax.  Prepare a new Xmax value for this.
+        */
+       compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+                                                         oldtup.t_data->t_infomask,
+                                                         oldtup.t_data->t_infomask2,
+                                                         xid, *lockmode, true,
+                                                         &xmax_old_tuple, &infomask_old_tuple,
+                                                         &infomask2_old_tuple);
+
+       /* And also prepare an Xmax value for the new copy of the tuple */
+       if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+               (checked_lockers && !locker_remains))
+               xmax_new_tuple = InvalidTransactionId;
+       else
+               xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+
+       if (!TransactionIdIsValid(xmax_new_tuple))
+       {
+               infomask_new_tuple = HEAP_XMAX_INVALID;
+               infomask2_new_tuple = 0;
+       }
+       else
+       {
+               if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
+               {
+                       GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
+                                                                  &infomask2_new_tuple);
+               }
+               else
+               {
+                       infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
+                       infomask2_new_tuple = 0;
+               }
+       }
+
+       /*
+        * Prepare the new tuple with the appropriate initial values of Xmin and
+        * Xmax, as well as initial infomask bits as computed above.
+        */
        newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
        newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
-       newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
        HeapTupleHeaderSetXmin(newtup->t_data, xid);
        HeapTupleHeaderSetCmin(newtup->t_data, cid);
-       HeapTupleHeaderSetXmax(newtup->t_data, 0);      /* for cleanliness */
+       newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
+       newtup->t_data->t_infomask2 |= infomask2_new_tuple;
+       HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
        newtup->t_tableOid = RelationGetRelid(relation);
 
        /*
@@ -3035,14 +3344,14 @@ l2:
        if (need_toast || newtupsize > pagefree)
        {
                /* Clear obsolete visibility flags ... */
-               oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                                          HEAP_XMAX_INVALID |
-                                                                          HEAP_XMAX_IS_MULTI |
-                                                                          HEAP_IS_LOCKED |
-                                                                          HEAP_MOVED);
+               oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+               oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
                HeapTupleClearHotUpdated(&oldtup);
                /* ... and store info about transaction updating this tuple */
-               HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+               Assert(TransactionIdIsValid(xmax_old_tuple));
+               HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
+               oldtup.t_data->t_infomask |= infomask_old_tuple;
+               oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
                HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
                /* temporarily make it look not-updated */
                oldtup.t_data->t_ctid = oldtup.t_self;
@@ -3145,7 +3454,7 @@ l2:
                 * to do a HOT update.  Check if any of the index columns have been
                 * changed.  If not, then HOT update is possible.
                 */
-               if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
+               if (satisfies_hot)
                        use_hot_update = true;
        }
        else
@@ -3193,13 +3502,13 @@ l2:
        if (!already_marked)
        {
                /* Clear obsolete visibility flags ... */
-               oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                                          HEAP_XMAX_INVALID |
-                                                                          HEAP_XMAX_IS_MULTI |
-                                                                          HEAP_IS_LOCKED |
-                                                                          HEAP_MOVED);
+               oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+               oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
                /* ... and store info about transaction updating this tuple */
-               HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+               Assert(TransactionIdIsValid(xmax_old_tuple));
+               HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
+               oldtup.t_data->t_infomask |= infomask_old_tuple;
+               oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
                HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
        }
 
@@ -3229,8 +3538,8 @@ l2:
        /* XLOG stuff */
        if (RelationNeedsWAL(relation))
        {
-               XLogRecPtr      recptr = log_heap_update(relation, buffer, oldtup.t_self,
-                                                                                        newbuf, heaptup,
+               XLogRecPtr      recptr = log_heap_update(relation, buffer,
+                                                                                        newbuf, &oldtup, heaptup,
                                                                                         all_visible_cleared,
                                                                                         all_visible_cleared_new);
 
@@ -3272,7 +3581,7 @@ l2:
         * Release the lmgr tuple lock, if we had it.
         */
        if (have_tuple_lock)
-               UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+               UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
 
        pgstat_count_heap_update(relation, use_hot_update);
 
@@ -3287,13 +3596,14 @@ l2:
        }
 
        bms_free(hot_attrs);
+       bms_free(key_attrs);
 
        return HeapTupleMayBeUpdated;
 }
 
 /*
  * Check if the specified attribute's value is same in both given tuples.
- * Subroutine for HeapSatisfiesHOTUpdate.
+ * Subroutine for HeapSatisfiesHOTandKeyUpdate.
  */
 static bool
 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
@@ -3327,7 +3637,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
 
        /*
         * Extract the corresponding values.  XXX this is pretty inefficient if
-        * there are many indexed columns.      Should HeapSatisfiesHOTUpdate do a
+        * there are many indexed columns.      Should HeapSatisfiesHOTandKeyUpdate do a
         * single heap_deform_tuple call on each tuple, instead?  But that doesn't
         * work for system columns ...
         */
@@ -3370,35 +3680,101 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
 }
 
 /*
- * Check if the old and new tuples represent a HOT-safe update. To be able
- * to do a HOT update, we must not have changed any columns used in index
- * definitions.
+ * Check which columns are being updated.
+ *
+ * This simultaneously checks conditions for HOT updates and for FOR KEY
+ * SHARE updates.  Since much of the time they will be checking very similar
+ * sets of columns, and doing the same tests on them, it makes sense to
+ * optimize and do them together.
  *
- * The set of attributes to be checked is passed in (we dare not try to
- * compute it while holding exclusive buffer lock...)  NOTE that hot_attrs
- * is destructively modified!  That is OK since this is invoked at most once
- * by heap_update().
+ * We receive two bitmapsets comprising the two sets of columns we're
+ * interested in.  Note these are destructively modified; that is OK since
+ * this is invoked at most once in heap_update.
  *
- * Returns true if safe to do HOT update.
+ * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not
+ * modified indexed columns); key_result is set to TRUE if the update does not
+ * modify columns used in the key.
  */
-static bool
-HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
-                                          HeapTuple oldtup, HeapTuple newtup)
+static void
+HeapSatisfiesHOTandKeyUpdate(Relation relation,
+                                                        Bitmapset *hot_attrs, Bitmapset *key_attrs,
+                                                        bool *satisfies_hot, bool *satisfies_key,
+                                                        HeapTuple oldtup, HeapTuple newtup)
 {
-       int                     attrnum;
+       int             next_hot_attnum;
+       int             next_key_attnum;
+       bool    hot_result = true;
+       bool    key_result = true;
+       bool    key_done = false;
+       bool    hot_done = false;
+
+       next_hot_attnum = bms_first_member(hot_attrs);
+       if (next_hot_attnum == -1)
+               hot_done = true;
+       else
+               /* Adjust for system attributes */
+               next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
 
-       while ((attrnum = bms_first_member(hot_attrs)) >= 0)
-       {
+       next_key_attnum = bms_first_member(key_attrs);
+       if (next_key_attnum == -1)
+               key_done = true;
+       else
                /* Adjust for system attributes */
-               attrnum += FirstLowInvalidHeapAttributeNumber;
+               next_key_attnum += FirstLowInvalidHeapAttributeNumber;
 
-               /* If the attribute value has changed, we can't do HOT update */
-               if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
-                                                                       oldtup, newtup))
-                       return false;
+       for (;;)
+       {
+               int             check_now;
+               bool    changed;
+
+               /* both bitmapsets are now empty */
+               if (key_done && hot_done)
+                       break;
+
+               /* XXX there's probably an easier way ... */
+               if (hot_done)
+                       check_now = next_key_attnum;
+               if (key_done)
+                       check_now = next_hot_attnum;
+               else
+                       check_now = Min(next_hot_attnum, next_key_attnum);
+
+               changed = !heap_tuple_attr_equals(RelationGetDescr(relation),
+                                                                                 check_now, oldtup, newtup);
+               if (changed)
+               {
+                       if (check_now == next_hot_attnum)
+                               hot_result = false;
+                       if (check_now == next_key_attnum)
+                               key_result = false;
+               }
+
+               /* if both are false now, we can stop checking */
+               if (!hot_result && !key_result)
+                       break;
+
+               if (check_now == next_hot_attnum)
+               {
+                       next_hot_attnum = bms_first_member(hot_attrs);
+                       if (next_hot_attnum == -1)
+                               hot_done = true;
+                       else
+                               /* Adjust for system attributes */
+                               next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
+               }
+               if (check_now == next_key_attnum)
+               {
+                       next_key_attnum = bms_first_member(key_attrs);
+                       if (next_key_attnum == -1)
+                               key_done = true;
+                       else
+                               /* Adjust for system attributes */
+                               next_key_attnum += FirstLowInvalidHeapAttributeNumber;
+               }
        }
 
-       return true;
+       *satisfies_hot = hot_result;
+       *satisfies_key = key_result;
 }
 
 /*
@@ -3414,11 +3790,12 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
 {
        HTSU_Result result;
        HeapUpdateFailureData hufd;
+       LockTupleMode lockmode;
 
        result = heap_update(relation, otid, tup,
                                                 GetCurrentCommandId(true), InvalidSnapshot,
                                                 true /* wait for commit */,
-                                                &hufd);
+                                                &hufd, &lockmode);
        switch (result)
        {
                case HeapTupleSelfUpdated:
@@ -3440,6 +3817,28 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
        }
 }
 
+
+/*
+ * Map a tuple lock mode to its MultiXactStatus (update or plain-lock flavor).
+ */
+static MultiXactStatus
+get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
+{
+       int             retval;         /* int, so the -1 "invalid" marker is kept */
+
+       if (is_update)
+               retval = tupleLockExtraInfo[mode].updstatus;
+       else
+               retval = tupleLockExtraInfo[mode].lockstatus;
+
+       if (retval == -1)
+               elog(ERROR, "invalid lock tuple mode %d/%s", mode,
+                        is_update ? "true" : "false");
+
+       return (MultiXactStatus) retval;
+}
+
+
 /*
  *     heap_lock_tuple - lock a tuple in shared or exclusive mode
  *
@@ -3452,6 +3851,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
  *             tuple's cmax if lock is successful)
  *     mode: indicates if shared or exclusive tuple lock is desired
  *     nowait: if true, ereport rather than blocking if lock not available
+ *     follow_updates: if true, follow the update chain to also lock descendant
+ *             tuples.
  *
  * Output parameters:
  *     *tuple: all fields filled in
@@ -3464,61 +3865,30 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
  *     HeapTupleUpdated: lock failed because tuple updated by other xact
  *
  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
  * cannot obtain cmax from a combocid generated by another transaction).
  * See comments for struct HeapUpdateFailureData for additional info.
  *
- *
- * NOTES: because the shared-memory lock table is of finite size, but users
- * could reasonably want to lock large numbers of tuples, we do not rely on
- * the standard lock manager to store tuple-level locks over the long term.
- * Instead, a tuple is marked as locked by setting the current transaction's
- * XID as its XMAX, and setting additional infomask bits to distinguish this
- * usage from the more normal case of having deleted the tuple.  When
- * multiple transactions concurrently share-lock a tuple, the first locker's
- * XID is replaced in XMAX with a MultiTransactionId representing the set of
- * XIDs currently holding share-locks.
- *
- * When it is necessary to wait for a tuple-level lock to be released, the
- * basic delay is provided by XactLockTableWait or MultiXactIdWait on the
- * contents of the tuple's XMAX.  However, that mechanism will release all
- * waiters concurrently, so there would be a race condition as to which
- * waiter gets the tuple, potentially leading to indefinite starvation of
- * some waiters.  The possibility of share-locking makes the problem much
- * worse --- a steady stream of share-lockers can easily block an exclusive
- * locker forever.     To provide more reliable semantics about who gets a
- * tuple-level lock first, we use the standard lock manager.  The protocol
- * for waiting for a tuple-level lock is really
- *             LockTuple()
- *             XactLockTableWait()
- *             mark tuple as locked by me
- *             UnlockTuple()
- * When there are multiple waiters, arbitration of who is to get the lock next
- * is provided by LockTuple(). However, at most one tuple-level lock will
- * be held or awaited per backend at any time, so we don't risk overflow
- * of the lock table.  Note that incoming share-lockers are required to
- * do LockTuple as well, if there is any conflict, to ensure that they don't
- * starve out waiting exclusive-lockers.  However, if there is not any active
- * conflict for a tuple, we don't incur any extra overhead.
+ * See README.tuplock for a thorough explanation of this mechanism.
  */
 HTSU_Result
 heap_lock_tuple(Relation relation, HeapTuple tuple,
                                CommandId cid, LockTupleMode mode, bool nowait,
+                               bool follow_updates,
                                Buffer *buffer, HeapUpdateFailureData *hufd)
 {
        HTSU_Result result;
        ItemPointer tid = &(tuple->t_self);
        ItemId          lp;
        Page            page;
-       TransactionId xid;
-       TransactionId xmax;
-       uint16          old_infomask;
-       uint16          new_infomask;
-       LOCKMODE        tuple_lock_type;
+       TransactionId xid,
+                               xmax;
+       uint16          old_infomask,
+                               new_infomask,
+                               new_infomask2;
        bool            have_tuple_lock = false;
 
-       tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
-
        *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
        LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
@@ -3542,30 +3912,58 @@ l3:
        {
                TransactionId xwait;
                uint16          infomask;
+               uint16          infomask2;
+               bool            require_sleep;
+               ItemPointerData t_ctid;
 
                /* must copy state data before unlocking buffer */
-               xwait = HeapTupleHeaderGetXmax(tuple->t_data);
+               xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
                infomask = tuple->t_data->t_infomask;
+               infomask2 = tuple->t_data->t_infomask2;
+               ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
 
                LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
 
                /*
-                * If we wish to acquire share lock, and the tuple is already
-                * share-locked by a multixact that includes any subtransaction of the
-                * current top transaction, then we effectively hold the desired lock
-                * already.  We *must* succeed without trying to take the tuple lock,
-                * else we will deadlock against anyone waiting to acquire exclusive
-                * lock.  We don't need to make any state changes in this case.
+                * If any subtransaction of the current top transaction already holds a
+                * lock as strong or stronger than what we're requesting, we
+                * effectively hold the desired lock already.  We *must* succeed
+                * without trying to take the tuple lock, else we will deadlock against
+                * anyone wanting to acquire a stronger lock.
                 */
-               if (mode == LockTupleShared &&
-                       (infomask & HEAP_XMAX_IS_MULTI) &&
-                       MultiXactIdIsCurrent((MultiXactId) xwait))
+               if (infomask & HEAP_XMAX_IS_MULTI)
                {
-                       Assert(infomask & HEAP_XMAX_SHARED_LOCK);
-                       /* Probably can't hold tuple lock here, but may as well check */
-                       if (have_tuple_lock)
-                               UnlockTuple(relation, tid, tuple_lock_type);
-                       return HeapTupleMayBeUpdated;
+                       int             i;
+                       int             nmembers;
+                       MultiXactMember *members;
+
+                       /*
+                        * We don't need to allow old multixacts here; if that had been the
+                        * case, HeapTupleSatisfiesUpdate would have returned MayBeUpdated
+                        * and we wouldn't be here.
+                        */
+                       nmembers = GetMultiXactIdMembers(xwait, &members, false);
+
+                       for (i = 0; i < nmembers; i++)
+                       {
+                               if (TransactionIdIsCurrentTransactionId(members[i].xid))
+                               {
+                                       LockTupleMode   membermode;
+
+                                       membermode = TUPLOCK_from_mxstatus(members[i].status);
+
+                                       if (membermode >= mode)
+                                       {
+                                               if (have_tuple_lock)
+                                                       UnlockTupleTuplock(relation, tid, mode);
+
+                                               pfree(members);
+                                               return HeapTupleMayBeUpdated;
+                                       }
+                               }
+                       }
+
+                       pfree(members);
                }
 
                /*
@@ -3581,255 +3979,435 @@ l3:
                {
                        if (nowait)
                        {
-                               if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
+                               if (!ConditionalLockTupleTuplock(relation, tid, mode))
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-                                       errmsg("could not obtain lock on row in relation \"%s\"",
-                                                  RelationGetRelationName(relation))));
+                                                        errmsg("could not obtain lock on row in relation \"%s\"",
+                                                                       RelationGetRelationName(relation))));
                        }
                        else
-                               LockTuple(relation, tid, tuple_lock_type);
+                               LockTupleTuplock(relation, tid, mode);
                        have_tuple_lock = true;
                }
 
-               if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
+               /*
+                * Initially assume that we will have to wait for the locking
+                * transaction(s) to finish.  We check various cases below in which
+                * this can be turned off.
+                */
+               require_sleep = true;
+               if (mode == LockTupleKeyShare)
                {
                        /*
-                        * Acquiring sharelock when there's at least one sharelocker
-                        * already.  We need not wait for him/them to complete.
-                        */
-                       LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
-
-                       /*
-                        * Make sure it's still a shared lock, else start over.  (It's OK
-                        * if the ownership of the shared lock has changed, though.)
+                        * If we're requesting KeyShare, and there's no update present, we
+                        * don't need to wait.  Even if there is an update, we can still
+                        * continue if the key hasn't been modified.
+                        *
+                        * However, if there are updates, we need to walk the update chain
+                        * to mark future versions of the row as locked, too.  That way, if
+                        * somebody deletes that future version, we're protected against
+                        * the key going away.  This locking of future versions could block
+                        * momentarily, if a concurrent transaction is deleting a key; or
+                        * it could return a value to the effect that the transaction
+                        * deleting the key has already committed.  So we do this before
+                        * re-locking the buffer; otherwise this would be prone to
+                        * deadlocks.
+                        *
+                        * Note that the TID we're locking was grabbed before we unlocked
+                        * the buffer.  For it to change while we're not looking, the other
+                        * properties we're testing for below after re-locking the buffer
+                        * would also change, in which case we would restart this loop
+                        * above.
                         */
-                       if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
-                               goto l3;
-               }
-               else if (infomask & HEAP_XMAX_IS_MULTI)
-               {
-                       /* wait for multixact to end */
-                       if (nowait)
+                       if (!(infomask2 & HEAP_KEYS_UPDATED))
                        {
-                               if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-                                       errmsg("could not obtain lock on row in relation \"%s\"",
-                                                  RelationGetRelationName(relation))));
-                       }
-                       else
-                               MultiXactIdWait((MultiXactId) xwait);
+                               bool    updated;
 
-                       LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                               updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
 
-                       /*
-                        * If xwait had just locked the tuple then some other xact could
-                        * update this tuple before we get to this point. Check for xmax
-                        * change, and start over if so.
-                        */
-                       if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
-                                                                        xwait))
-                               goto l3;
+                               /*
+                                * If there are updates, follow the update chain; bail out
+                                * if that cannot be done.
+                                */
+                               if (follow_updates && updated)
+                               {
+                                       HTSU_Result             res;
+
+                                       res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+                                                                                                 GetCurrentTransactionId(),
+                                                                                                 mode);
+                                       if (res != HeapTupleMayBeUpdated)
+                                       {
+                                               result = res;
+                                               /* recovery code expects to have buffer lock held */
+                                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                                               goto failed;
+                                       }
+                               }
 
-                       /*
-                        * You might think the multixact is necessarily done here, but not
-                        * so: it could have surviving members, namely our own xact or
-                        * other subxacts of this backend.      It is legal for us to lock the
-                        * tuple in either case, however.  We don't bother changing the
-                        * on-disk hint bits since we are about to overwrite the xmax
-                        * altogether.
-                        */
+                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+                               /*
+                                * Make sure it's still an appropriate lock, else start over.
+                                * Also, if it wasn't updated before we released the lock, but
+                                * is updated now, we start over too; the reason is that we now
+                                * need to follow the update chain to lock the new versions.
+                                */
+                               if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
+                                       ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
+                                        !updated))
+                                       goto l3;
+
+                               /* Things look okay, so we can skip sleeping */
+                               require_sleep = false;
+
+                               /*
+                                * Note we allow Xmax to change here; other updaters/lockers
+                                * could have modified it before we grabbed the buffer lock.
+                                * However, this is not a problem, because with the recheck we
+                                * just did we ensure that they still don't conflict with the
+                                * lock we want.
+                                */
+                       }
                }
-               else
+               else if (mode == LockTupleShare)
                {
-                       /* wait for regular transaction to end */
-                       if (nowait)
+                       /*
+                        * If we're requesting Share, we can similarly avoid sleeping if
+                        * there's no update and no exclusive lock present.
+                        */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+                               !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
                        {
-                               if (!ConditionalXactLockTableWait(xwait))
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-                                       errmsg("could not obtain lock on row in relation \"%s\"",
-                                                  RelationGetRelationName(relation))));
-                       }
-                       else
-                               XactLockTableWait(xwait);
-
-                       LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
+                               /*
+                                * Make sure it's still an appropriate lock, else start over.
+                                * See above about allowing xmax to change.
+                                */
+                               if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
+                                       HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
+                                       goto l3;
+                               require_sleep = false;
+                       }
+               }
+               else if (mode == LockTupleNoKeyExclusive)
+               {
                        /*
-                        * xwait is done, but if xwait had just locked the tuple then some
-                        * other xact could update this tuple before we get to this point.
-                        * Check for xmax change, and start over if so.
+                        * If we're requesting NoKeyExclusive, we might also be able to
+                        * avoid sleeping; just ensure that there's no other lock type than
+                        * KeyShare.  Note that this is a bit more involved than just
+                        * checking hint bits -- we need to expand the multixact to figure
+                        * out lock modes for each one (unless there was only one such
+                        * locker).
                         */
-                       if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
-                               !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
-                                                                        xwait))
-                               goto l3;
+                       if (infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               int             nmembers;
+                               MultiXactMember *members;
 
-                       /* Otherwise check if it committed or aborted */
-                       UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
+                               /*
+                                * We don't need to allow old multixacts here; if that had been
+                                * the case, HeapTupleSatisfiesUpdate would have returned
+                                * MayBeUpdated and we wouldn't be here.
+                                */
+                               nmembers = GetMultiXactIdMembers(xwait, &members, false);
+
+                               if (nmembers <= 0)
+                               {
+                                       /*
+                                        * No need to keep the previous xmax here. This is unlikely
+                                        * to happen.
+                                        */
+                                       require_sleep = false;
+                               }
+                               else
+                               {
+                                       int             i;
+                                       bool    allowed = true;
+
+                                       for (i = 0; i < nmembers; i++)
+                                       {
+                                               if (members[i].status != MultiXactStatusForKeyShare)
+                                               {
+                                                       allowed = false;
+                                                       break;
+                                               }
+                                       }
+                                       if (allowed)
+                                       {
+                                               /*
+                                                * if the xmax changed under us in the meantime, start
+                                                * over.
+                                                */
+                                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                                               if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+                                                                                                xwait))
+                                               {
+                                                       pfree(members);
+                                                       goto l3;
+                                               }
+                                               /* otherwise, we're good */
+                                               require_sleep = false;
+                                       }
+
+                                       pfree(members);
+                               }
+                       }
+                       else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+                       {
+                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+                               /* if the xmax changed in the meantime, start over */
+                               if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+                                                                                xwait))
+                                       goto l3;
+                               /* otherwise, we're good */
+                               require_sleep = false;
+                       }
                }
 
                /*
-                * We may lock if previous xmax aborted, or if it committed but only
-                * locked the tuple without updating it.  The case where we didn't
-                * wait because we are joining an existing shared lock is correctly
-                * handled, too.
+                * By here, we either have already acquired the buffer exclusive lock,
+                * or we must wait for the locking transaction or multixact; so below
+                * we ensure that we grab buffer lock after the sleep.
                 */
-               if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
-                                                                                HEAP_IS_LOCKED))
-                       result = HeapTupleMayBeUpdated;
-               else
-                       result = HeapTupleUpdated;
-       }
 
-       if (result != HeapTupleMayBeUpdated)
-       {
-               Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
-               Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
-               hufd->ctid = tuple->t_data->t_ctid;
-               hufd->xmax = HeapTupleHeaderGetXmax(tuple->t_data);
-               if (result == HeapTupleSelfUpdated)
-                       hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
-               else
-                       hufd->cmax = 0;         /* for lack of an InvalidCommandId value */
-               LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
-               if (have_tuple_lock)
-                       UnlockTuple(relation, tid, tuple_lock_type);
-               return result;
-       }
+               if (require_sleep)
+               {
+                       if (infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               MultiXactStatus status = get_mxact_status_for_lock(mode, false);
 
-       /*
-        * We might already hold the desired lock (or stronger), possibly under a
-        * different subtransaction of the current top transaction.  If so, there
-        * is no need to change state or issue a WAL record.  We already handled
-        * the case where this is true for xmax being a MultiXactId, so now check
-        * for cases where it is a plain TransactionId.
-        *
-        * Note in particular that this covers the case where we already hold
-        * exclusive lock on the tuple and the caller only wants shared lock. It
-        * would certainly not do to give up the exclusive lock.
-        */
-       xmax = HeapTupleHeaderGetXmax(tuple->t_data);
-       old_infomask = tuple->t_data->t_infomask;
-
-       if (!(old_infomask & (HEAP_XMAX_INVALID |
-                                                 HEAP_XMAX_COMMITTED |
-                                                 HEAP_XMAX_IS_MULTI)) &&
-               (mode == LockTupleShared ?
-                (old_infomask & HEAP_IS_LOCKED) :
-                (old_infomask & HEAP_XMAX_EXCL_LOCK)) &&
-               TransactionIdIsCurrentTransactionId(xmax))
-       {
-               LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
-               /* Probably can't hold tuple lock here, but may as well check */
-               if (have_tuple_lock)
-                       UnlockTuple(relation, tid, tuple_lock_type);
-               return HeapTupleMayBeUpdated;
-       }
+                               /* We only ever lock tuples, never update them */
+                               if (status >= MultiXactStatusNoKeyUpdate)
+                                       elog(ERROR, "invalid lock mode in heap_lock_tuple");
 
-       /*
-        * Compute the new xmax and infomask to store into the tuple.  Note we do
-        * not modify the tuple just yet, because that would leave it in the wrong
-        * state if multixact.c elogs.
-        */
-       xid = GetCurrentTransactionId();
-
-       new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED |
-                                                                       HEAP_XMAX_INVALID |
-                                                                       HEAP_XMAX_IS_MULTI |
-                                                                       HEAP_IS_LOCKED |
-                                                                       HEAP_MOVED);
+                               /* wait for multixact to end */
+                               if (nowait)
+                               {
+                                       if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
+                                                                                                       status, NULL, infomask))
+                                               ereport(ERROR,
+                                                               (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+                                                                errmsg("could not obtain lock on row in relation \"%s\"",
+                                                                               RelationGetRelationName(relation))));
+                               }
+                               else
+                                       MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
 
-       if (mode == LockTupleShared)
-       {
-               /*
-                * If this is the first acquisition of a shared lock in the current
-                * transaction, set my per-backend OldestMemberMXactId setting. We can
-                * be certain that the transaction will never become a member of any
-                * older MultiXactIds than that.  (We have to do this even if we end
-                * up just using our own TransactionId below, since some other backend
-                * could incorporate our XID into a MultiXact immediately afterwards.)
-                */
-               MultiXactIdSetOldestMember();
+                               /* if there are updates, follow the update chain */
+                               if (follow_updates &&
+                                       !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+                               {
+                                       HTSU_Result             res;
+
+                                       res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+                                                                                                 GetCurrentTransactionId(),
+                                                                                                 mode);
+                                       if (res != HeapTupleMayBeUpdated)
+                                       {
+                                               result = res;
+                                               /* recovery code expects to have buffer lock held */
+                                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                                               goto failed;
+                                       }
+                               }
 
-               new_infomask |= HEAP_XMAX_SHARED_LOCK;
+                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
-               /*
-                * Check to see if we need a MultiXactId because there are multiple
-                * lockers.
-                *
-                * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if
-                * the xmax was a MultiXactId but it was not running anymore. There is
-                * a race condition, which is that the MultiXactId may have finished
-                * since then, but that uncommon case is handled within
-                * MultiXactIdExpand.
-                *
-                * There is a similar race condition possible when the old xmax was a
-                * regular TransactionId.  We test TransactionIdIsInProgress again
-                * just to narrow the window, but it's still possible to end up
-                * creating an unnecessary MultiXactId.  Fortunately this is harmless.
-                */
-               if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED)))
-               {
-                       if (old_infomask & HEAP_XMAX_IS_MULTI)
-                       {
                                /*
-                                * If the XMAX is already a MultiXactId, then we need to
-                                * expand it to include our own TransactionId.
+                                * If xwait had just locked the tuple then some other xact
+                                * could update this tuple before we get to this point. Check
+                                * for xmax change, and start over if so.
                                 */
-                               xid = MultiXactIdExpand((MultiXactId) xmax, xid);
-                               new_infomask |= HEAP_XMAX_IS_MULTI;
-                       }
-                       else if (TransactionIdIsInProgress(xmax))
-                       {
+                               if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+                                                                                xwait))
+                                       goto l3;
+
                                /*
-                                * If the XMAX is a valid TransactionId, then we need to
-                                * create a new MultiXactId that includes both the old locker
-                                * and our own TransactionId.
+                                * Of course, the multixact might not be done here: if we're
+                                * requesting a light lock mode, other transactions with light
+                                * locks could still be alive, as well as locks owned by our
+                                * own xact or other subxacts of this backend.  We need to
+                                * preserve the surviving MultiXact members.  Note that it
+                                * isn't absolutely necessary in the latter case, but doing so
+                                * is simpler.
                                 */
-                               xid = MultiXactIdCreate(xmax, xid);
-                               new_infomask |= HEAP_XMAX_IS_MULTI;
                        }
                        else
                        {
+                               /* wait for regular transaction to end */
+                               if (nowait)
+                               {
+                                       if (!ConditionalXactLockTableWait(xwait))
+                                               ereport(ERROR,
+                                                               (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+                                                                errmsg("could not obtain lock on row in relation \"%s\"",
+                                                                               RelationGetRelationName(relation))));
+                               }
+                               else
+                                       XactLockTableWait(xwait);
+
+                               /* if there are updates, follow the update chain */
+                               if (follow_updates &&
+                                       !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+                               {
+                                       HTSU_Result             res;
+
+                                       res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+                                                                                                 GetCurrentTransactionId(),
+                                                                                                 mode);
+                                       if (res != HeapTupleMayBeUpdated)
+                                       {
+                                               result = res;
+                                               /* recovery code expects to have buffer lock held */
+                                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+                                               goto failed;
+                                       }
+                               }
+
+                               LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
                                /*
-                                * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax
-                                * as running, but it finished before
-                                * TransactionIdIsInProgress() got to run.      Treat it like
-                                * there's no locker in the tuple.
+                                * xwait is done, but if xwait had just locked the tuple then
+                                * some other xact could update this tuple before we get to
+                                * this point.  Check for xmax change, and start over if so.
                                 */
+                               if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+                                       !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+                                                                                xwait))
+                                       goto l3;
+
+                               /*
+                                * Otherwise check if it committed or aborted.  Note we cannot
+                                * be here if the tuple was only locked by somebody who didn't
+                                * conflict with us; that should have been handled above.  So
+                                * that transaction must necessarily be gone by now.
+                                */
+                               UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
                        }
                }
+
+               /* By here, we're certain that we hold buffer exclusive lock again */
+
+               /*
+                * We may lock if previous xmax aborted, or if it committed but only
+                * locked the tuple without updating it; or if we didn't have to wait
+                * at all for whatever reason.
+                */
+               if (!require_sleep ||
+                       (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
+                       HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
+                       HeapTupleHeaderIsOnlyLocked(tuple->t_data))
+                       result = HeapTupleMayBeUpdated;
                else
-               {
-                       /*
-                        * There was no previous locker, so just insert our own
-                        * TransactionId.
-                        */
-               }
+                       result = HeapTupleUpdated;
        }
-       else
+
+failed:
+       if (result != HeapTupleMayBeUpdated)
+       {
+               Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
+               Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
+               hufd->ctid = tuple->t_data->t_ctid;
+               hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
+               if (result == HeapTupleSelfUpdated)
+                       hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
+               else
+                       hufd->cmax = 0;         /* for lack of an InvalidCommandId value */
+               LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+               if (have_tuple_lock)
+                       UnlockTupleTuplock(relation, tid, mode);
+               return result;
+       }
+
+       xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
+       old_infomask = tuple->t_data->t_infomask;
+
+       /*
+        * We might already hold the desired lock (or stronger), possibly under a
+        * different subtransaction of the current top transaction.  If so, there
+        * is no need to change state or issue a WAL record.  We already handled
+        * the case where this is true for xmax being a MultiXactId, so now check
+        * for cases where it is a plain TransactionId.
+        *
+        * Note in particular that this covers the case where we already hold
+        * exclusive lock on the tuple and the caller only wants key share or share
+        * lock. It would certainly not do to give up the exclusive lock.
+        */
+       if (!(old_infomask & (HEAP_XMAX_INVALID |
+                                                 HEAP_XMAX_COMMITTED |
+                                                 HEAP_XMAX_IS_MULTI)) &&
+               (mode == LockTupleKeyShare ?
+                (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask) ||
+                 HEAP_XMAX_IS_SHR_LOCKED(old_infomask) ||
+                 HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) :
+                mode == LockTupleShare ?
+                (HEAP_XMAX_IS_SHR_LOCKED(old_infomask) ||
+                 HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) :
+                (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))) &&
+               TransactionIdIsCurrentTransactionId(xmax))
        {
-               /* We want an exclusive lock on the tuple */
-               new_infomask |= HEAP_XMAX_EXCL_LOCK;
+               LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+               /* Probably can't hold tuple lock here, but may as well check */
+               if (have_tuple_lock)
+                       UnlockTupleTuplock(relation, tid, mode);
+               return HeapTupleMayBeUpdated;
        }
 
+       /*
+        * If this is the first possibly-multixact-able operation in the
+        * current transaction, set my per-backend OldestMemberMXactId setting.
+        * We can be certain that the transaction will never become a member of
+        * any older MultiXactIds than that.  (We have to do this even if we
+        * end up just using our own TransactionId below, since some other
+        * backend could incorporate our XID into a MultiXact immediately
+        * afterwards.)
+        */
+       MultiXactIdSetOldestMember();
+
+       /*
+        * Compute the new xmax and infomask to store into the tuple.  Note we do
+        * not modify the tuple just yet, because that would leave it in the wrong
+        * state if multixact.c elogs.
+        */
+       compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
+                                                         GetCurrentTransactionId(), mode, false,
+                                                         &xid, &new_infomask, &new_infomask2);
+
        START_CRIT_SECTION();
 
        /*
         * Store transaction information of xact locking the tuple.
         *
         * Note: Cmax is meaningless in this context, so don't set it; this avoids
-        * possibly generating a useless combo CID.
+        * possibly generating a useless combo CID.  Moreover, if we're locking a
+        * previously updated tuple, it's important to preserve the Cmax.
+        *
+        * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
+        * we would break the HOT chain.
         */
-       tuple->t_data->t_infomask = new_infomask;
-       HeapTupleHeaderClearHotUpdated(tuple->t_data);
+       tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
+       tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+       tuple->t_data->t_infomask |= new_infomask;
+       tuple->t_data->t_infomask2 |= new_infomask2;
+       if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
+               HeapTupleHeaderClearHotUpdated(tuple->t_data);
        HeapTupleHeaderSetXmax(tuple->t_data, xid);
-       /* Make sure there is no forward chain link in t_ctid */
-       tuple->t_data->t_ctid = *tid;
+
+       /*
+        * Make sure there is no forward chain link in t_ctid.  Note that in the
+        * cases where the tuple has been updated, we must not overwrite t_ctid,
+        * because it was set by the updater.  Moreover, if the tuple has been
+        * updated, we need to follow the update chain to lock the new versions
+        * of the tuple as well.
+        */
+       if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
+               tuple->t_data->t_ctid = *tid;
 
        MarkBufferDirty(*buffer);
 
@@ -3854,8 +4432,8 @@ l3:
                xlrec.target.node = relation->rd_node;
                xlrec.target.tid = tuple->t_self;
                xlrec.locking_xid = xid;
-               xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
-               xlrec.shared_lock = (mode == LockTupleShared);
+               xlrec.infobits_set = compute_infobits(new_infomask,
+                                                                                         tuple->t_data->t_infomask2);
                rdata[0].data = (char *) &xlrec;
                rdata[0].len = SizeOfHeapLock;
                rdata[0].buffer = InvalidBuffer;
@@ -3887,8 +4465,469 @@ l3:
         * release the lmgr tuple lock, if we had it.
         */
        if (have_tuple_lock)
-               UnlockTuple(relation, tid, tuple_lock_type);
+               UnlockTupleTuplock(relation, tid, mode);
+
+       return HeapTupleMayBeUpdated;
+}
+
+
+/*
+ * Given an original set of Xmax and infomask, and a transaction (identified by
+ * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
+ * corresponding infomasks to use on the tuple.
+ *
+ * Note that this might have side effects such as creating a new MultiXactId.
+ *
+ * Most callers will have called HeapTupleSatisfiesUpdate before this function;
+ * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
+ * but it was not running anymore. There is a race condition, which is that the
+ * MultiXactId may have finished since then, but that uncommon case is handled
+ * either here, or within MultiXactIdExpand.
+ *
+ * There is a similar race condition possible when the old xmax was a regular
+ * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
+ * window, but it's still possible to end up creating an unnecessary
+ * MultiXactId.  Fortunately this is harmless.
+ */
+static void
+compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
+                                                 uint16 old_infomask2, TransactionId add_to_xmax,
+                                                 LockTupleMode mode, bool is_update,
+                                                 TransactionId *result_xmax, uint16 *result_infomask,
+                                                 uint16 *result_infomask2)
+{
+       TransactionId   new_xmax;
+       uint16                  new_infomask,
+                                       new_infomask2;
+
+l5:
+       new_infomask = 0;
+       new_infomask2 = 0;
+       if (old_infomask & HEAP_XMAX_INVALID)
+       {
+               /*
+                * No previous locker; we just insert our own TransactionId.
+                */
+               if (is_update)
+               {
+                       new_xmax = add_to_xmax;
+                       if (mode == LockTupleExclusive)
+                               new_infomask2 |= HEAP_KEYS_UPDATED;
+               }
+               else
+               {
+                       new_infomask |= HEAP_XMAX_LOCK_ONLY;
+                       switch (mode)
+                       {
+                               case LockTupleKeyShare:
+                                       new_xmax = add_to_xmax;
+                                       new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+                                       break;
+                               case LockTupleShare:
+                                       new_xmax = add_to_xmax;
+                                       new_infomask |= HEAP_XMAX_SHR_LOCK;
+                                       break;
+                               case LockTupleNoKeyExclusive:
+                                       new_xmax = add_to_xmax;
+                                       new_infomask |= HEAP_XMAX_EXCL_LOCK;
+                                       break;
+                               case LockTupleExclusive:
+                                       new_xmax = add_to_xmax;
+                                       new_infomask |= HEAP_XMAX_EXCL_LOCK;
+                                       new_infomask2 |= HEAP_KEYS_UPDATED;
+                                       break;
+                               default:
+                                       new_xmax = InvalidTransactionId;        /* silence compiler */
+                                       elog(ERROR, "invalid lock mode");
+                       }
+               }
+       }
+       else if (old_infomask & HEAP_XMAX_IS_MULTI)
+       {
+               MultiXactStatus         new_status;
+
+               /*
+                * Currently we don't allow XMAX_COMMITTED to be set for multis,
+                * so cross-check.
+                */
+               Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
+
+               /*
+                * A multixact together with LOCK_ONLY set but neither lock bit set
+                * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
+                * anymore.  This check is critical for databases upgraded by
+                * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
+                * that such multis are never passed.
+                */
+               if (!(old_infomask & HEAP_LOCK_MASK) &&
+                       HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
+               {
+                       old_infomask &= ~HEAP_XMAX_IS_MULTI;
+                       old_infomask |= HEAP_XMAX_INVALID;
+                       goto l5;
+               }
+
+               /*
+                * If the XMAX is already a MultiXactId, then we need to expand it to
+                * include add_to_xmax; but if all the members were lockers and are all
+                * gone, we can do away with the IS_MULTI bit and just set add_to_xmax
+                * as the only locker/updater.  If all lockers are gone and we have an
+                * updater that aborted, we can also do without a multi.
+                *
+                * The cost of doing GetMultiXactIdMembers would be paid by
+                * MultiXactIdExpand if we weren't to do this, so this check is not
+                * incurring extra work anyhow.
+                */
+               if (!MultiXactIdIsRunning(xmax))
+               {
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
+                               TransactionIdDidAbort(MultiXactIdGetUpdateXid(xmax,
+                                                                                                                         old_infomask)))
+                       {
+                               /*
+                                * Reset these bits and restart; otherwise fall through to
+                                * create a new multi below.
+                                */
+                               old_infomask &= ~HEAP_XMAX_IS_MULTI;
+                               old_infomask |= HEAP_XMAX_INVALID;
+                               goto l5;
+                       }
+               }
+
+               new_status = get_mxact_status_for_lock(mode, is_update);
+
+               new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
+                                                                        new_status);
+               GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+       }
+       else if (old_infomask & HEAP_XMAX_COMMITTED)
+       {
+               /*
+                * It's a committed update, so we need to preserve that transaction as
+                * the updater of the tuple.
+                */
+               MultiXactStatus         status;
+               MultiXactStatus         new_status;
+
+               if (old_infomask2 & HEAP_KEYS_UPDATED)
+                       status = MultiXactStatusUpdate;
+               else
+                       status = MultiXactStatusNoKeyUpdate;
+
+               new_status = get_mxact_status_for_lock(mode, is_update);
+               /*
+                * since it's not running, it's obviously impossible for the old
+                * updater to be identical to the current one, so we need not check
+                * for that case as we do in the block above.
+                */
+               new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+               GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+       }
+       else if (TransactionIdIsInProgress(xmax))
+       {
+               /*
+                * If the XMAX is a valid, in-progress TransactionId, then we need to
+                * create a new MultiXactId that includes both the old locker or
+                * updater and our own TransactionId.
+                */
+               MultiXactStatus         status;
+               MultiXactStatus         new_status;
+
+               if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
+               {
+                       if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
+                               status = MultiXactStatusForKeyShare;
+                       else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
+                               status = MultiXactStatusForShare;
+                       else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
+                       {
+                               if (old_infomask2 & HEAP_KEYS_UPDATED)
+                                       status = MultiXactStatusForUpdate;
+                               else
+                                       status = MultiXactStatusForNoKeyUpdate;
+                       }
+                       else
+                       {
+                               /*
+                                * LOCK_ONLY can be present alone only when a page has been
+                                * upgraded by pg_upgrade.  But in that case,
+                                * TransactionIdIsInProgress() should have returned false.  We
+                                * assume it's no longer locked in this case.
+                                */
+                               elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
+                               old_infomask |= HEAP_XMAX_INVALID;
+                               old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
+                               goto l5;
+                       }
+               }
+               else
+               {
+                       /* it's an update, but which kind? */
+                       if (old_infomask2 & HEAP_KEYS_UPDATED)
+                               status = MultiXactStatusUpdate;
+                       else
+                               status = MultiXactStatusNoKeyUpdate;
+               }
+
+               new_status = get_mxact_status_for_lock(mode, is_update);
+
+               /*
+                * If the existing lock mode is identical to or weaker than the new
+                * one, we can act as though there is no existing lock, so set
+                * XMAX_INVALID and restart.
+                */
+               if (xmax == add_to_xmax)
+               {
+                       LockTupleMode   old_mode = TUPLOCK_from_mxstatus(status);
+                       bool                    old_isupd = ISUPDATE_from_mxstatus(status);
+
+                       /*
+                        * We can do this if the new LockTupleMode is higher or equal than
+                        * the old one; and if there was previously an update, we need an
+                        * update, but if there wasn't, then we can accept there not being
+                        * one.
+                        */
+                       if ((mode >= old_mode) && (is_update || !old_isupd))
+                       {
+                               /*
+                                * Note that the infomask might contain some other dirty bits.
+                                * However, since the new infomask is reset to zero, we only
+                                * set what's minimally necessary, and that the case that
+                                * checks HEAP_XMAX_INVALID is the very first above, there is
+                                * no need for extra cleanup of the infomask here.
+                                */
+                               old_infomask |= HEAP_XMAX_INVALID;
+                               goto l5;
+                       }
+               }
+               new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+               GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+       }
+       else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
+                        TransactionIdDidCommit(xmax))
+       {
+               /*
+                * It's a committed update, so we gotta preserve him as updater of the
+                * tuple.
+                */
+               MultiXactStatus         status;
+               MultiXactStatus         new_status;
+
+               if (old_infomask2 & HEAP_KEYS_UPDATED)
+                       status = MultiXactStatusUpdate;
+               else
+                       status = MultiXactStatusNoKeyUpdate;
+
+               new_status = get_mxact_status_for_lock(mode, is_update);
+               /*
+                * since it's not running, it's obviously impossible for the old
+                * updater to be identical to the current one, so we need not check
+                * for that case as we do in the block above.
+                */
+               new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+               GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+       }
+       else
+       {
+               /*
+                * Can get here iff the locking/updating transaction was running when
+                * the infomask was extracted from the tuple, but finished before
+                * TransactionIdIsInProgress got to run.  Deal with it as if there was
+                * no locker at all in the first place.
+                */
+               old_infomask |= HEAP_XMAX_INVALID;
+               goto l5;
+       }
+
+       *result_infomask = new_infomask;
+       *result_infomask2 = new_infomask2;
+       *result_xmax = new_xmax;
+}
+
+
+/*
+ * Recursive part of heap_lock_updated_tuple
+ *
+ * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
+ * xid with the given mode; if this tuple is updated, recurse to lock the new
+ * version as well.
+ *
+ * Despite the name, the chain is walked with a loop rather than by actual
+ * recursion: each iteration locks one tuple version and then follows its
+ * t_ctid to the next one.
+ *
+ * Returns HeapTupleMayBeUpdated once the end of the update chain has been
+ * reached; HeapTupleSelfUpdated if some version was updated with its key
+ * modified (or deleted) by the current transaction; and HeapTupleUpdated if
+ * such an update was made by another transaction that has committed.  An
+ * error is raised if a version in the chain cannot be fetched.
+ */
+static HTSU_Result
+heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
+                                                       LockTupleMode mode)
+{
+       ItemPointerData tupid;  /* TID of the version to process next */
+       HeapTupleData   mytup;
+       Buffer                  buf;
+       uint16                  new_infomask,
+                                       new_infomask2,
+                                       old_infomask;
+       TransactionId   xmax,
+                                       new_xmax;
+
+       ItemPointerCopy(tid, &tupid);
+
+       for (;;)
+       {
+               new_infomask = 0;
+               new_xmax = InvalidTransactionId;
+               ItemPointerCopy(&tupid, &(mytup.t_self));
+
+               if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
+                       elog(ERROR, "unable to fetch updated version of tuple");
+
+l4:    /* re-entered after sleeping on an in-progress updater, see below */
+               CHECK_FOR_INTERRUPTS();
+               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+               old_infomask = mytup.t_data->t_infomask;
+               xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
+
+               /*
+                * If this tuple is updated and the key has been modified (or deleted),
+                * what we do depends on the status of the updating transaction: if
+                * it's live, we sleep until it finishes; if it has committed, we have
+                * to fail (i.e. return HeapTupleUpdated); if it aborted, we ignore it.
+                * For updates that didn't touch the key, we can just plough ahead.
+                */
+               if (!(old_infomask & HEAP_XMAX_INVALID) &&
+                       (mytup.t_data->t_infomask2 & HEAP_KEYS_UPDATED))
+               {
+                       TransactionId   update_xid;
+
+                       /*
+                        * Note: we *must* check TransactionIdIsInProgress before
+                        * TransactionIdDidAbort/Commit; see comment at top of tqual.c for
+                        * an explanation.
+                        */
+                       update_xid = HeapTupleHeaderGetUpdateXid(mytup.t_data);
+                       if (TransactionIdIsCurrentTransactionId(update_xid))
+                       {
+                               UnlockReleaseBuffer(buf);
+                               return HeapTupleSelfUpdated;
+                       }
+                       else if (TransactionIdIsInProgress(update_xid))
+                       {
+                               /*
+                                * Drop the buffer lock before sleeping; it is taken again,
+                                * and the tuple header re-read, after jumping back to l4.
+                                */
+                               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                               /* No LockTupleTuplock here -- see heap_lock_updated_tuple */
+                               XactLockTableWait(update_xid);
+                               goto l4;
+                       }
+                       else if (TransactionIdDidAbort(update_xid))
+                               ;       /* okay to proceed */
+                       else if (TransactionIdDidCommit(update_xid))
+                       {
+                               UnlockReleaseBuffer(buf);
+                               return HeapTupleUpdated;
+                       }
+                       /* neither aborted nor committed: falls through and proceeds */
+               }
+
+               /*
+                * compute the new Xmax and infomask values for the tuple ...
+                *
+                * is_update is passed as false: we are adding a lock, not an update.
+                */
+               compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
+                                                                 xid, mode, false,
+                                                                 &new_xmax, &new_infomask, &new_infomask2);
+
+               START_CRIT_SECTION();
+
+               /*
+                * ... and set them
+                *
+                * The old xmax-related bits and HEAP_KEYS_UPDATED are cleared first,
+                * so only what was just computed remains set.
+                */
+               HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
+               mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
+               mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+               mytup.t_data->t_infomask |= new_infomask;
+               mytup.t_data->t_infomask2 |= new_infomask2;
+
+               MarkBufferDirty(buf);
+
+               /*
+                * XLOG stuff
+                *
+                * Emit an XLOG_HEAP2_LOCK_UPDATED record carrying the tuple's TID, the
+                * new xmax and the infobits derived from the new infomask values.
+                * Nothing is logged for relations that do not need WAL.
+                */
+               if (RelationNeedsWAL(rel))
+               {
+                       xl_heap_lock_updated xlrec;
+                       XLogRecPtr      recptr;
+                       XLogRecData     rdata[2];
+                       Page            page = BufferGetPage(buf);
+
+                       xlrec.target.node = rel->rd_node;
+                       xlrec.target.tid = mytup.t_self;
+                       xlrec.xmax = new_xmax;
+                       xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
+
+                       rdata[0].data = (char *) &xlrec;
+                       rdata[0].len = SizeOfHeapLockUpdated;
+                       rdata[0].buffer = InvalidBuffer;
+                       rdata[0].next = &(rdata[1]);
+
+                       rdata[1].data = NULL;   /* no payload, only the buffer reference */
+                       rdata[1].len = 0;
+                       rdata[1].buffer = buf;
+                       rdata[1].buffer_std = true;
+                       rdata[1].next = NULL;
+
+                       recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata);
+
+                       PageSetLSN(page, recptr);
+                       PageSetTLI(page, ThisTimeLineID);
+               }
+
+               END_CRIT_SECTION();
+
+               /*
+                * if we find the end of update chain, we're done.
+                *
+                * Note that these tests look at the tuple header as it stands after
+                * the modification made above, not at old_infomask.
+                */
+               if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
+                       ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid)  ||
+                       HeapTupleHeaderIsOnlyLocked(mytup.t_data))
+               {
+                       UnlockReleaseBuffer(buf);
+                       return HeapTupleMayBeUpdated;
+               }
 
+               /*
+                * tail recursion
+                *
+                * That is, loop around to process the successor version; its TID is
+                * copied out of the tuple header before the buffer is released.
+                */
+               ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
+               UnlockReleaseBuffer(buf);
+       }
+}
+
+/*
+ * heap_lock_updated_tuple
+ *             Follow update chain when locking an updated tuple, acquiring locks (row
+ *             marks) on the updated versions.
+ *
+ * The initial tuple is assumed to be already locked.
+ *
+ * This function doesn't check visibility, it just unconditionally marks the
+ * tuple(s) as locked.  If any tuple in the updated chain is being deleted
+ * concurrently (or updated with the key being modified), sleep until the
+ * transaction doing it is finished.
+ *
+ * Note that we don't acquire heavyweight tuple locks on the tuples we walk
+ * when we have to wait for other transactions to release them, as opposed to
+ * what heap_lock_tuple does.  The reason is that having more than one
+ * transaction walking the chain is probably uncommon enough that risk of
+ * starvation is not likely: one of the preconditions for being here is that
+ * the snapshot in use predates the update that created this tuple (because we
+ * started at an earlier version of the tuple), but at the same time such a
+ * transaction cannot be using repeatable read or serializable isolation
+ * levels, because that would lead to a serializability failure.
+ *
+ * "tuple" is the initial version and "ctid" the TID of its successor; "xid"
+ * and "mode" are the locker and the lock strength to record on each
+ * following version.  If ctid is equal to the tuple's own TID there is no
+ * newer version, and HeapTupleMayBeUpdated is returned without doing
+ * anything; otherwise the result of heap_lock_updated_tuple_rec is returned.
+ */
+static HTSU_Result
+heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
+                                               TransactionId xid, LockTupleMode mode)
+{
+       if (!ItemPointerEquals(&tuple->t_self, ctid))
+       {
+               /*
+                * If this is the first possibly-multixact-able operation in the
+                * current transaction, set my per-backend OldestMemberMXactId setting.
+                * We can be certain that the transaction will never become a member of
+                * any older MultiXactIds than that.  (We have to do this even if we
+                * end up just using our own TransactionId below, since some other
+                * backend could incorporate our XID into a MultiXact immediately
+                * afterwards.)
+                */
+               MultiXactIdSetOldestMember();
+
+               return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+       }
+
+       /* nothing to lock */
        return HeapTupleMayBeUpdated;
 }
 
@@ -4010,6 +5049,9 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  * because this function is applied during WAL recovery, when we don't have
  * access to any such state, and can't depend on the hint bits to be set.)
  *
+ * Similarly, cutoff_multi must be less than or equal to the smallest
+ * MultiXactId used by any transaction currently open.
+ *
  * If the tuple is in a shared buffer, caller must hold an exclusive lock on
  * that buffer.
  *
@@ -4023,7 +5065,8 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
  * infomask bits.
  */
 bool
-heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
+heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+                                 MultiXactId cutoff_multi)
 {
        bool            changed = false;
        TransactionId xid;
@@ -4043,43 +5086,29 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
                changed = true;
        }
 
-       if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+       /*
+        * Note that this code handles IS_MULTI Xmax values, too, but only to mark
+        * the tuple frozen if the updating Xid in the mxact is below the freeze
+        * cutoff; it doesn't remove dead members of a very old multixact.
+        */
+       xid = HeapTupleHeaderGetRawXmax(tuple);
+       if (TransactionIdIsNormal(xid) &&
+               (((!(tuple->t_infomask & HEAP_XMAX_IS_MULTI) &&
+                  TransactionIdPrecedes(xid, cutoff_xid))) ||
+                MultiXactIdPrecedes(xid, cutoff_multi)))
        {
-               xid = HeapTupleHeaderGetXmax(tuple);
-               if (TransactionIdIsNormal(xid) &&
-                       TransactionIdPrecedes(xid, cutoff_xid))
-               {
-                       HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+               HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
 
-                       /*
-                        * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
-                        * + LOCKED.  Normalize to INVALID just to be sure no one gets
-                        * confused.
-                        */
-                       tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
-                       tuple->t_infomask |= HEAP_XMAX_INVALID;
-                       HeapTupleHeaderClearHotUpdated(tuple);
-                       changed = true;
-               }
-       }
-       else
-       {
-               /*----------
-                * XXX perhaps someday we should zero out very old MultiXactIds here?
-                *
-                * The only way a stale MultiXactId could pose a problem is if a
-                * tuple, having once been multiply-share-locked, is not touched by
-                * any vacuum or attempted lock or deletion for just over 4G MultiXact
-                * creations, and then in the probably-narrow window where its xmax
-                * is again a live MultiXactId, someone tries to lock or delete it.
-                * Even then, another share-lock attempt would work fine.  An
-                * exclusive-lock or delete attempt would face unexpected delay, or
-                * in the very worst case get a deadlock error.  This seems an
-                * extremely low-probability scenario with minimal downside even if
-                * it does happen, so for now we don't do the extra bookkeeping that
-                * would be needed to clean out MultiXactIds.
-                *----------
+               /*
+                * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
+                * + LOCKED.  Normalize to INVALID just to be sure no one gets
+                * confused.  Also get rid of the HEAP_KEYS_UPDATED bit.
                 */
+               tuple->t_infomask &= ~HEAP_XMAX_BITS;
+               tuple->t_infomask |= HEAP_XMAX_INVALID;
+               HeapTupleHeaderClearHotUpdated(tuple);
+               tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+               changed = true;
        }
 
        /*
@@ -4115,18 +5144,269 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
        return changed;
 }
 
+/*
+ * For a given MultiXactId, return the hint bits that should be set in the
+ * tuple's infomask.
+ *
+ * Normally this should be called for a multixact that was just created, and
+ * so is on our local cache, so the GetMembers call is fast.
+ *
+ * On return, *new_infomask holds HEAP_XMAX_IS_MULTI, the lock-strength bits
+ * corresponding to the strongest mode held by any member, and
+ * HEAP_XMAX_LOCK_ONLY if no member is an updater; *new_infomask2 holds
+ * HEAP_KEYS_UPDATED if any member is a FOR UPDATE locker or a key-modifying
+ * updater, and is zero otherwise.
+ *
+ * The lock-strength bits must be derived from the single strongest mode
+ * rather than OR'ed together member by member: HEAP_XMAX_SHR_LOCK is defined
+ * (in htup_details.h) as the combination of HEAP_XMAX_KEYSHR_LOCK and
+ * HEAP_XMAX_EXCL_LOCK, so accumulating the key-share bit from one member and
+ * the exclusive bit from another would make the tuple look share-locked when
+ * it is in fact exclusive-locked.
+ */
+static void
+GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
+                                          uint16 *new_infomask2)
+{
+       int             nmembers;
+       MultiXactMember *members;
+       int             i;
+       uint16  bits = HEAP_XMAX_IS_MULTI;
+       uint16  bits2 = 0;
+       bool    has_update = false;
+       LockTupleMode   strongest = LockTupleKeyShare;
+
+       /*
+        * We only use this in multis we just created, so they cannot be values
+        * pre-pg_upgrade.
+        */
+       nmembers = GetMultiXactIdMembers(multi, &members, false);
+
+       for (i = 0; i < nmembers; i++)
+       {
+               LockTupleMode   mode;
+
+               /*
+                * Remember the strongest lock mode held by any member of the
+                * multixact.
+                */
+               mode = TUPLOCK_from_mxstatus(members[i].status);
+               if (mode > strongest)
+                       strongest = mode;
+
+               /* See what other bits this member requires */
+               switch (members[i].status)
+               {
+                       case MultiXactStatusForKeyShare:
+                       case MultiXactStatusForShare:
+                       case MultiXactStatusForNoKeyUpdate:
+                               /* plain lockers contribute only to the lock strength */
+                               break;
+                       case MultiXactStatusForUpdate:
+                               bits2 |= HEAP_KEYS_UPDATED;
+                               break;
+                       case MultiXactStatusNoKeyUpdate:
+                               has_update = true;
+                               break;
+                       case MultiXactStatusUpdate:
+                               bits2 |= HEAP_KEYS_UPDATED;
+                               has_update = true;
+                               break;
+               }
+       }
+
+       /*
+        * Encode the strongest mode found.  When no members were returned no
+        * lock-strength bit is set at all, same as before.
+        */
+       if (nmembers > 0)
+       {
+               if (strongest == LockTupleExclusive ||
+                       strongest == LockTupleNoKeyExclusive)
+                       bits |= HEAP_XMAX_EXCL_LOCK;
+               else if (strongest == LockTupleShare)
+                       bits |= HEAP_XMAX_SHR_LOCK;
+               else if (strongest == LockTupleKeyShare)
+                       bits |= HEAP_XMAX_KEYSHR_LOCK;
+       }
+
+       if (!has_update)
+               bits |= HEAP_XMAX_LOCK_ONLY;
+
+       if (nmembers > 0)
+               pfree(members);
+
+       *new_infomask = bits;
+       *new_infomask2 = bits2;
+}
+
+/*
+ * MultiXactIdGetUpdateXid
+ *
+ * Given a multixact Xmax and corresponding infomask, which does not have the
+ * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
+ * transaction.
+ *
+ * Members that are mere lockers, and updaters that have aborted, are skipped.
+ * InvalidTransactionId is returned if no other member is found, or if the
+ * multixact has no members at all.
+ */
+static TransactionId
+MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
+{
+       TransactionId   update_xact = InvalidTransactionId;
+       MultiXactMember *members;
+       int                             nmembers;
+
+       /* caller must pass a multixact xmax that includes an update */
+       Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
+       Assert(t_infomask & HEAP_XMAX_IS_MULTI);
+
+       /*
+        * Since we know the LOCK_ONLY bit is not set, this cannot be a
+        * multi from pre-pg_upgrade.
+        */
+       nmembers = GetMultiXactIdMembers(xmax, &members, false);
+
+       if (nmembers > 0)
+       {
+               int             i;
+
+               for (i = 0; i < nmembers; i++)
+               {
+                       /* Ignore lockers */
+                       if (members[i].status == MultiXactStatusForKeyShare ||
+                               members[i].status == MultiXactStatusForShare ||
+                               members[i].status == MultiXactStatusForNoKeyUpdate ||
+                               members[i].status == MultiXactStatusForUpdate)
+                               continue;
+
+                       /* ignore aborted transactions */
+                       if (TransactionIdDidAbort(members[i].xid))
+                               continue;
+                       /* there should be at most one non-aborted updater */
+                       Assert(update_xact == InvalidTransactionId);
+                       Assert(members[i].status == MultiXactStatusNoKeyUpdate ||
+                                  members[i].status == MultiXactStatusUpdate);
+                       update_xact = members[i].xid;
+#ifndef USE_ASSERT_CHECKING
+                       /*
+                        * in an assert-enabled build, walk the whole array to ensure
+                        * there's no other updater.
+                        *
+                        * (Without assertions we stop at the first updater found.)
+                        */
+                       break;
+#endif
+               }
+
+               pfree(members);
+       }
+
+       return update_xact;
+}
+
+/*
+ * HeapTupleGetUpdateXid
+ *             As above, but use a HeapTupleHeader
+ *
+ * The tuple's raw xmax and t_infomask are handed to MultiXactIdGetUpdateXid,
+ * so the infomask must have HEAP_XMAX_IS_MULTI set and HEAP_XMAX_LOCK_ONLY
+ * clear (both are asserted there).
+ *
+ * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
+ * checking the hint bits.
+ */
+TransactionId
+HeapTupleGetUpdateXid(HeapTupleHeader tuple)
+{
+       return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
+                                                                  tuple->t_infomask);
+}
+
+/*
+ * Do_MultiXactIdWait
+ *             Actual implementation for the two functions below.
+ *
+ * We do this by sleeping on each member using XactLockTableWait.  Any
+ * members that belong to the current backend are *not* waited for, however;
+ * this would not merely be useless but would lead to Assert failure inside
+ * XactLockTableWait.  By the time this returns, it is certain that all
+ * transactions *of other backends* that were members of the MultiXactId
+ * that conflict with the requested status are dead (and no new ones can have
+ * been added, since it is not legal to add members to an existing
+ * MultiXactId).
+ *
+ * But by the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * Note that in case we return false, the number of remaining members is
+ * not to be trusted.
+ *
+ * "status" is the lock strength the caller wants; only members whose lock
+ * mode conflicts with it are waited for.  "infomask" is consulted solely to
+ * compute the allow_old flag passed to GetMultiXactIdMembers.  If "nowait"
+ * is true we never sleep: the first conflicting member that cannot be waited
+ * for without blocking makes us stop and return false.  Otherwise the result
+ * is always true.  If "remaining" is not NULL, *remaining is set to the
+ * number of members that were counted as still present (see below).
+ */
+static bool
+Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+                                  int *remaining, uint16 infomask, bool nowait)
+{
+       bool            allow_old;
+       bool            result = true;
+       MultiXactMember *members;
+       int                     nmembers;
+       int                     remain = 0;     /* running count for *remaining */
+
+       /* true only for LOCK_ONLY infomasks carrying none of the lock bits */
+       allow_old = !(infomask & HEAP_LOCK_MASK) && HEAP_XMAX_IS_LOCKED_ONLY(infomask);
+       nmembers = GetMultiXactIdMembers(multi, &members, allow_old);
+
+       if (nmembers >= 0)
+       {
+               int                     i;
+
+               for (i = 0; i < nmembers; i++)
+               {
+                       TransactionId memxid = members[i].xid;
+                       MultiXactStatus memstatus = members[i].status;
+
+                       /* our own members are counted, but never waited for */
+                       if (TransactionIdIsCurrentTransactionId(memxid))
+                       {
+                               remain++;
+                               continue;
+                       }
+
+                       /*
+                        * a non-conflicting member need not be waited for; it is
+                        * counted if still running (checked only when the caller asked
+                        * for the count)
+                        */
+                       if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
+                                                                        LOCKMODE_from_mxstatus(status)))
+                       {
+                               if (remaining && TransactionIdIsInProgress(memxid))
+                                       remain++;
+                               continue;
+                       }
+
+                       /*
+                        * This member conflicts with our multi, so we have to sleep (or
+                        * return failure, if asked to avoid waiting.)
+                        */
+                       if (nowait)
+                       {
+                               result = ConditionalXactLockTableWait(memxid);
+                               if (!result)
+                                       break;  /* later members are left unexamined */
+                       }
+                       else
+                               XactLockTableWait(memxid);
+               }
+
+               pfree(members);
+       }
+
+       if (remaining)
+               *remaining = remain;
+
+       return result;
+}
+
+/*
+ * MultiXactIdWait
+ *             Sleep on a MultiXactId.
+ *
+ * By the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * We return (in *remaining, if not NULL) the number of members that are still
+ * running, including any (non-aborted) subtransactions of our own transaction.
+ *
+ * This is Do_MultiXactIdWait with nowait = false; all other arguments are
+ * passed through unchanged and its result is discarded.
+ */
+static void
+MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+                               int *remaining, uint16 infomask)
+{
+       Do_MultiXactIdWait(multi, status, remaining, infomask, false);
+}
+
+/*
+ * ConditionalMultiXactIdWait
+ *             As above, but only lock if we can get the lock without blocking.
+ *
+ * By the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * If the multixact is now all gone, return true.  Returns false if some
+ * transactions might still be running.
+ *
+ * We return (in *remaining, if not NULL) the number of members that are still
+ * running, including any (non-aborted) subtransactions of our own transaction.
+ *
+ * This is Do_MultiXactIdWait with nowait = true; all other arguments are
+ * passed through unchanged and its result is returned as is.
+ */
+static bool
+ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+                                                  int *remaining, uint16 infomask)
+{
+       return Do_MultiXactIdWait(multi, status, remaining, infomask, true);
+}
+
 /*
  * heap_tuple_needs_freeze
  *
  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the specified cutoff XID.  If so, return TRUE.
+ * are older than the specified cutoff XID or MultiXactId.  If so, return TRUE.
  *
  * It doesn't matter whether the tuple is alive or dead, we are checking
  * to see if a tuple needs to be removed or frozen to avoid wraparound.
  */
 bool
 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
-                                               Buffer buf)
+                                               MultiXactId cutoff_multi, Buffer buf)
 {
        TransactionId xid;
 
@@ -4135,12 +5415,23 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
                TransactionIdPrecedes(xid, cutoff_xid))
                return true;
 
-       if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+       if (!(tuple->t_infomask & HEAP_XMAX_INVALID))
        {
-               xid = HeapTupleHeaderGetXmax(tuple);
-               if (TransactionIdIsNormal(xid) &&
-                       TransactionIdPrecedes(xid, cutoff_xid))
-                       return true;
+               if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+               {
+                       xid = HeapTupleHeaderGetRawXmax(tuple);
+                       if (TransactionIdIsNormal(xid) &&
+                               TransactionIdPrecedes(xid, cutoff_xid))
+                               return true;
+               }
+               else
+               {
+                       MultiXactId multi;
+
+                       multi = HeapTupleHeaderGetRawXmax(tuple);
+                       if (MultiXactIdPrecedes(multi, cutoff_multi))
+                               return true;
+               }
        }
 
        if (tuple->t_infomask & HEAP_MOVED)
@@ -4231,7 +5522,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
                                                                           TransactionId *latestRemovedXid)
 {
        TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
-       TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+       TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
        TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
 
        if (tuple->t_infomask & HEAP_MOVED)
@@ -4387,7 +5678,7 @@ log_heap_clean(Relation reln, Buffer buffer,
  */
 XLogRecPtr
 log_heap_freeze(Relation reln, Buffer buffer,
-                               TransactionId cutoff_xid,
+                               TransactionId cutoff_xid, MultiXactId cutoff_multi,
                                OffsetNumber *offsets, int offcnt)
 {
        xl_heap_freeze xlrec;
@@ -4402,6 +5693,7 @@ log_heap_freeze(Relation reln, Buffer buffer,
        xlrec.node = reln->rd_node;
        xlrec.block = BufferGetBlockNumber(buffer);
        xlrec.cutoff_xid = cutoff_xid;
+       xlrec.cutoff_multi = cutoff_multi;
 
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapFreeze;
@@ -4463,8 +5755,8 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
  * have modified the buffer(s) and marked them dirty.
  */
 static XLogRecPtr
-log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
-                               Buffer newbuf, HeapTuple newtup,
+log_heap_update(Relation reln, Buffer oldbuf,
+                               Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
                                bool all_visible_cleared, bool new_all_visible_cleared)
 {
        xl_heap_update xlrec;
@@ -4483,7 +5775,11 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
                info = XLOG_HEAP_UPDATE;
 
        xlrec.target.node = reln->rd_node;
-       xlrec.target.tid = from;
+       xlrec.target.tid = oldtup->t_self;
+       xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
+       xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
+                                                                                         oldtup->t_data->t_infomask2);
+       xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
        xlrec.all_visible_cleared = all_visible_cleared;
        xlrec.newtid = newtup->t_self;
        xlrec.new_all_visible_cleared = new_all_visible_cleared;
@@ -4748,6 +6044,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
 {
        xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
        TransactionId cutoff_xid = xlrec->cutoff_xid;
+       MultiXactId     cutoff_multi = xlrec->cutoff_multi;
        Buffer          buffer;
        Page            page;
 
@@ -4790,7 +6087,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
                        ItemId          lp = PageGetItemId(page, *offsets);
                        HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
 
-                       (void) heap_freeze_tuple(tuple, cutoff_xid);
+                       (void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
                        offsets++;
                }
        }
@@ -4937,6 +6234,33 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
        UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Translate the "infobits" byte carried in an XLog record back into tuple
+ * header flags.  The multi/lock bits of *infomask and the keys-updated bit of
+ * *infomask2 are cleared and then re-set according to infobits; all other
+ * bits of both masks are left alone.
+ *
+ * (This is the reverse of compute_infobits).
+ */
+static void
+fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
+{
+       uint16          xmax_flags = 0;
+
+       /* Work out which infomask bits the record asks for. */
+       if (infobits & XLHL_XMAX_IS_MULTI)
+               xmax_flags |= HEAP_XMAX_IS_MULTI;
+       if (infobits & XLHL_XMAX_LOCK_ONLY)
+               xmax_flags |= HEAP_XMAX_LOCK_ONLY;
+       if (infobits & XLHL_XMAX_EXCL_LOCK)
+               xmax_flags |= HEAP_XMAX_EXCL_LOCK;
+       /* note HEAP_XMAX_SHR_LOCK isn't considered here */
+       if (infobits & XLHL_XMAX_KEYSHR_LOCK)
+               xmax_flags |= HEAP_XMAX_KEYSHR_LOCK;
+
+       /* Drop whatever was there before, then install the requested bits. */
+       *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
+                                  HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
+       *infomask2 &= ~HEAP_KEYS_UPDATED;
+       *infomask |= xmax_flags;
+
+       if (infobits & XLHL_KEYS_UPDATED)
+               *infomask2 |= HEAP_KEYS_UPDATED;
+}
+
 static void
 heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -4992,13 +6316,12 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 
        htup = (HeapTupleHeader) PageGetItem(page, lp);
 
-       htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                 HEAP_XMAX_INVALID |
-                                                 HEAP_XMAX_IS_MULTI |
-                                                 HEAP_IS_LOCKED |
-                                                 HEAP_MOVED);
+       htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+       htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        HeapTupleHeaderClearHotUpdated(htup);
-       HeapTupleHeaderSetXmax(htup, record->xl_xid);
+       fix_infomask_from_infobits(xlrec->infobits_set,
+                                                          &htup->t_infomask, &htup->t_infomask2);
+       HeapTupleHeaderSetXmax(htup, xlrec->xmax);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
 
        /* Mark the page as a candidate for pruning */
@@ -5368,16 +6691,15 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 
        htup = (HeapTupleHeader) PageGetItem(page, lp);
 
-       htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                 HEAP_XMAX_INVALID |
-                                                 HEAP_XMAX_IS_MULTI |
-                                                 HEAP_IS_LOCKED |
-                                                 HEAP_MOVED);
+       htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+       htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
        if (hot_update)
                HeapTupleHeaderSetHotUpdated(htup);
        else
                HeapTupleHeaderClearHotUpdated(htup);
-       HeapTupleHeaderSetXmax(htup, record->xl_xid);
+       fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
+                                                          &htup->t_infomask2);
+       HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
        /* Set forward chain link in t_ctid */
        htup->t_ctid = xlrec->newtid;
@@ -5484,6 +6806,7 @@ newsame:;
 
        HeapTupleHeaderSetXmin(htup, record->xl_xid);
        HeapTupleHeaderSetCmin(htup, FirstCommandId);
+       HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
        /* Make sure there is no forward chain link in t_ctid */
        htup->t_ctid = xlrec->newtid;
 
@@ -5564,17 +6887,8 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
 
        htup = (HeapTupleHeader) PageGetItem(page, lp);
 
-       htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
-                                                 HEAP_XMAX_INVALID |
-                                                 HEAP_XMAX_IS_MULTI |
-                                                 HEAP_IS_LOCKED |
-                                                 HEAP_MOVED);
-       if (xlrec->xid_is_mxact)
-               htup->t_infomask |= HEAP_XMAX_IS_MULTI;
-       if (xlrec->shared_lock)
-               htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
-       else
-               htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+       fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
+                                                          &htup->t_infomask2);
        HeapTupleHeaderClearHotUpdated(htup);
        HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
@@ -5586,6 +6900,56 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
        UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Replay an XLOG_HEAP2_LOCK_UPDATED record.
+ *
+ * Installs the xmax and the lock-related infomask bits carried by the record
+ * on the tuple identified by xlrec->target.  If the record carries a
+ * full-page image for block 0, that image is restored and nothing else is
+ * done.  Nothing is done either when XLogReadBuffer returns no valid buffer
+ * or when the page LSN shows the change was already applied.
+ *
+ * PANICs if the target offset is past the end of the page's line pointer
+ * array or the line pointer is not a normal item.
+ */
+static void
+heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record)
+{
+       xl_heap_lock_updated *xlrec =
+               (xl_heap_lock_updated *) XLogRecGetData(record);
+       Buffer          buffer;
+       Page            page;
+       OffsetNumber offnum;
+       ItemId          lp = NULL;
+       HeapTupleHeader htup;
+
+       /* If we have a full-page image, restore it and we're done */
+       if (record->xl_info & XLR_BKP_BLOCK(0))
+       {
+               (void) RestoreBackupBlock(lsn, record, 0, false, false);
+               return;
+       }
+
+       buffer = XLogReadBuffer(xlrec->target.node,
+                                                       ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+                                                       false);
+       if (!BufferIsValid(buffer))
+               return;
+       page = (Page) BufferGetPage(buffer);
+
+       if (lsn <= PageGetLSN(page))            /* changes are applied */
+       {
+               UnlockReleaseBuffer(buffer);
+               return;
+       }
+
+       /* Only fetch the line pointer once we know the offset is in range */
+       offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
+       if (PageGetMaxOffsetNumber(page) >= offnum)
+               lp = PageGetItemId(page, offnum);
+
+       if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
+               elog(PANIC, "heap_xlog_lock_updated: invalid lp");
+
+       htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+       /*
+        * Wipe all existing xmax-related state before installing the new xmax,
+        * the same way heap_xlog_delete and heap_xlog_update do.
+        * fix_infomask_from_infobits by itself only resets the multi/lock bits,
+        * so without this any other xmax status bits already on the tuple would
+        * survive and be paired with the new xmax.
+        */
+       htup->t_infomask &= ~HEAP_XMAX_BITS;
+       htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+       fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
+                                                          &htup->t_infomask2);
+       HeapTupleHeaderSetXmax(htup, xlrec->xmax);
+
+       PageSetLSN(page, lsn);
+       PageSetTLI(page, ThisTimeLineID);
+       MarkBufferDirty(buffer);
+       UnlockReleaseBuffer(buffer);
+}
+
 static void
 heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -5702,6 +7066,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
                case XLOG_HEAP2_MULTI_INSERT:
                        heap_xlog_multi_insert(lsn, record);
                        break;
+               case XLOG_HEAP2_LOCK_UPDATED:
+                       heap_xlog_lock_updated(lsn, record);
+                       break;
                default:
                        elog(PANIC, "heap2_redo: unknown op code %u", info);
        }
index 390585bd2eba86f25506adea6b3cba363cf76d2a..3ca332d28f2342752b1372f33ab06cf810a9d6e1 100644 (file)
@@ -463,7 +463,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                                 * that the page is reconsidered for pruning in future.
                                 */
                                heap_prune_record_prunable(prstate,
-                                                                                  HeapTupleHeaderGetXmax(htup));
+                                                                                  HeapTupleHeaderGetUpdateXid(htup));
                                break;
 
                        case HEAPTUPLE_DELETE_IN_PROGRESS:
@@ -473,7 +473,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                                 * that the page is reconsidered for pruning in future.
                                 */
                                heap_prune_record_prunable(prstate,
-                                                                                  HeapTupleHeaderGetXmax(htup));
+                                                                                  HeapTupleHeaderGetUpdateXid(htup));
                                break;
 
                        case HEAPTUPLE_LIVE:
@@ -521,7 +521,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
                           BufferGetBlockNumber(buffer));
                offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
-               priorXmax = HeapTupleHeaderGetXmax(htup);
+               priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
 
        /*
@@ -746,7 +746,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
 
                        /* Set up to scan the HOT-chain */
                        nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
-                       priorXmax = HeapTupleHeaderGetXmax(htup);
+                       priorXmax = HeapTupleHeaderGetUpdateXid(htup);
                }
                else
                {
@@ -787,7 +787,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
                                break;
 
                        nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
-                       priorXmax = HeapTupleHeaderGetXmax(htup);
+                       priorXmax = HeapTupleHeaderGetUpdateXid(htup);
                }
        }
 }
index 628e3b1277ec4a517e5a7225c9fcf59d85a19121..84472f80cd9dd1c439ff80d538013e3aa2da205c 100644 (file)
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/tqual.h"
 
 
 /*
@@ -128,6 +129,8 @@ typedef struct RewriteStateData
                                                                                 * determine tuple visibility */
        TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff
                                                                 * point */
+       MultiXactId     rs_freeze_multi;/* MultiXactId that will be used as freeze
+                                                                * cutoff point for multixacts */
        MemoryContext rs_cxt;           /* for hash tables and entries and tuples in
                                                                 * them */
        HTAB       *rs_unresolved_tups;         /* unmatched A tuples */
@@ -177,6 +180,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup);
  * new_heap            new, locked heap relation to insert tuples to
  * oldest_xmin xid used by the caller to determine which tuples are dead
  * freeze_xid  xid before which tuples will be frozen
+ * freeze_multi multixact before which multis will be frozen
  * use_wal             should the inserts to the new heap be WAL-logged?
  *
  * Returns an opaque RewriteState, allocated in current memory context,
@@ -184,7 +188,8 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup);
  */
 RewriteState
 begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
-                                  TransactionId freeze_xid, bool use_wal)
+                                  TransactionId freeze_xid, MultiXactId freeze_multi,
+                                  bool use_wal)
 {
        RewriteState state;
        MemoryContext rw_cxt;
@@ -213,6 +218,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
        state->rs_use_wal = use_wal;
        state->rs_oldest_xmin = oldest_xmin;
        state->rs_freeze_xid = freeze_xid;
+       state->rs_freeze_multi = freeze_multi;
        state->rs_cxt = rw_cxt;
 
        /* Initialize hash tables used to track update chains */
@@ -337,7 +343,8 @@ rewrite_heap_tuple(RewriteState state,
         * While we have our hands on the tuple, we may as well freeze any
         * very-old xmin or xmax, so that future VACUUM effort can be saved.
         */
-       heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid);
+       heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid,
+                                         state->rs_freeze_multi);
 
        /*
         * Invalid ctid means that ctid should point to the tuple itself. We'll
@@ -348,15 +355,15 @@ rewrite_heap_tuple(RewriteState state,
        /*
         * If the tuple has been updated, check the old-to-new mapping hash table.
         */
-       if (!(old_tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
-                                                                                  HEAP_IS_LOCKED)) &&
+       if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
+                 HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) &&
                !(ItemPointerEquals(&(old_tuple->t_self),
                                                        &(old_tuple->t_data->t_ctid))))
        {
                OldToNewMapping mapping;
 
                memset(&hashkey, 0, sizeof(hashkey));
-               hashkey.xmin = HeapTupleHeaderGetXmax(old_tuple->t_data);
+               hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data);
                hashkey.tid = old_tuple->t_data->t_ctid;
 
                mapping = (OldToNewMapping)
index 3809e5116615ce42d8202be8af8e51de65c7f652..272208417a33c0bac4825eb5f1520d5e834fe6a0 100644 (file)
@@ -25,6 +25,21 @@ out_target(StringInfo buf, xl_heaptid *target)
                                         ItemPointerGetOffsetNumber(&(target->tid)));
 }
 
+/*
+ * Append to buf the name of every flag set in an xlog record's "infobits"
+ * byte.  Each name is followed by a single space.
+ */
+static void
+out_infobits(StringInfo buf, uint8 infobits)
+{
+       /* flag names, listed in the order in which they are printed */
+       static const struct
+       {
+               uint8           bit;
+               const char *name;
+       }                       flag_names[] =
+       {
+               {XLHL_XMAX_IS_MULTI, "IS_MULTI "},
+               {XLHL_XMAX_LOCK_ONLY, "LOCK_ONLY "},
+               {XLHL_XMAX_EXCL_LOCK, "EXCL_LOCK "},
+               {XLHL_XMAX_KEYSHR_LOCK, "KEYSHR_LOCK "},
+               {XLHL_KEYS_UPDATED, "KEYS_UPDATED "}
+       };
+       int                     i;
+
+       for (i = 0; i < (int) (sizeof(flag_names) / sizeof(flag_names[0])); i++)
+       {
+               if (infobits & flag_names[i].bit)
+                       appendStringInfoString(buf, flag_names[i].name);
+       }
+}
+
 void
 heap_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
@@ -47,6 +62,8 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
 
                appendStringInfo(buf, "delete: ");
                out_target(buf, &(xlrec->target));
+               appendStringInfoChar(buf, ' ');
+               out_infobits(buf, xlrec->infobits_set);
        }
        else if (info == XLOG_HEAP_UPDATE)
        {
@@ -57,9 +74,12 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
                else
                        appendStringInfo(buf, "update: ");
                out_target(buf, &(xlrec->target));
-               appendStringInfo(buf, "; new %u/%u",
+               appendStringInfo(buf, " xmax %u ", xlrec->old_xmax);
+               out_infobits(buf, xlrec->old_infobits_set);
+               appendStringInfo(buf, "; new tid %u/%u xmax %u",
                                                 ItemPointerGetBlockNumber(&(xlrec->newtid)),
-                                                ItemPointerGetOffsetNumber(&(xlrec->newtid)));
+                                                ItemPointerGetOffsetNumber(&(xlrec->newtid)),
+                                                xlrec->new_xmax);
        }
        else if (info == XLOG_HEAP_HOT_UPDATE)
        {
@@ -70,9 +90,12 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
                else
                        appendStringInfo(buf, "hot_update: ");
                out_target(buf, &(xlrec->target));
-               appendStringInfo(buf, "; new %u/%u",
+               appendStringInfo(buf, " xmax %u ", xlrec->old_xmax);
+               out_infobits(buf, xlrec->old_infobits_set);
+               appendStringInfo(buf, "; new tid %u/%u xmax %u",
                                                 ItemPointerGetBlockNumber(&(xlrec->newtid)),
-                                                ItemPointerGetOffsetNumber(&(xlrec->newtid)));
+                                                ItemPointerGetOffsetNumber(&(xlrec->newtid)),
+                                                xlrec->new_xmax);
        }
        else if (info == XLOG_HEAP_NEWPAGE)
        {
@@ -87,16 +110,10 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
        {
                xl_heap_lock *xlrec = (xl_heap_lock *) rec;
 
-               if (xlrec->shared_lock)
-                       appendStringInfo(buf, "shared_lock: ");
-               else
-                       appendStringInfo(buf, "exclusive_lock: ");
-               if (xlrec->xid_is_mxact)
-                       appendStringInfo(buf, "mxid ");
-               else
-                       appendStringInfo(buf, "xid ");
-               appendStringInfo(buf, "%u ", xlrec->locking_xid);
+               appendStringInfo(buf, "lock %u: ", xlrec->locking_xid);
                out_target(buf, &(xlrec->target));
+               appendStringInfoChar(buf, ' ');
+               out_infobits(buf, xlrec->infobits_set);
        }
        else if (info == XLOG_HEAP_INPLACE)
        {
@@ -108,7 +125,6 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
        else
                appendStringInfo(buf, "UNKNOWN");
 }
-
 void
 heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
@@ -119,10 +135,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
        {
                xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;
 
-               appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u",
+               appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u",
                                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                                 xlrec->node.relNode, xlrec->block,
-                                                xlrec->cutoff_xid);
+                                                xlrec->cutoff_xid, xlrec->cutoff_multi);
        }
        else if (info == XLOG_HEAP2_CLEAN)
        {
@@ -160,6 +176,14 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
                                xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode,
                                                 xlrec->blkno, xlrec->ntuples);
        }
+       else if (info == XLOG_HEAP2_LOCK_UPDATED)
+       {
+               xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec;
+
+               appendStringInfo(buf, "lock updated: xmax %u msk %04x; ", xlrec->xmax,
+                                                xlrec->infobits_set);
+               out_target(buf, &(xlrec->target));
+       }
        else
                appendStringInfo(buf, "UNKNOWN");
 }
index ddd675f610cc48ab18705c9d727c22406ddcd0ef..3e6cba062d3ed35abf1a3a41880ad6a675170598 100644 (file)
 
 #include "access/multixact.h"
 
+/*
+ * Append a description of one multixact member to buf: its xid, followed by
+ * a short parenthesized tag for its status.
+ */
+static void
+out_member(StringInfo buf, MultiXactMember *member)
+{
+       const char *status_tag;
+
+       /* pick the tag for this member's status */
+       switch (member->status)
+       {
+               case MultiXactStatusForKeyShare:
+                       status_tag = "(keysh) ";
+                       break;
+               case MultiXactStatusForShare:
+                       status_tag = "(sh) ";
+                       break;
+               case MultiXactStatusForNoKeyUpdate:
+                       status_tag = "(fornokeyupd) ";
+                       break;
+               case MultiXactStatusForUpdate:
+                       status_tag = "(forupd) ";
+                       break;
+               case MultiXactStatusNoKeyUpdate:
+                       status_tag = "(nokeyupd) ";
+                       break;
+               case MultiXactStatusUpdate:
+                       status_tag = "(upd) ";
+                       break;
+               default:
+                       status_tag = "(unk) ";
+                       break;
+       }
+
+       appendStringInfo(buf, "%u ", member->xid);
+       appendStringInfoString(buf, status_tag);
+}
 
 void
 multixact_desc(StringInfo buf, uint8 xl_info, char *rec)
@@ -41,10 +70,10 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec)
                xl_multixact_create *xlrec = (xl_multixact_create *) rec;
                int                     i;
 
-               appendStringInfo(buf, "create multixact %u offset %u:",
-                                                xlrec->mid, xlrec->moff);
-               for (i = 0; i < xlrec->nxids; i++)
-                       appendStringInfo(buf, " %u", xlrec->xids[i]);
+               appendStringInfo(buf, "create mxid %u offset %u nmembers %d: ", xlrec->mid,
+                                                xlrec->moff, xlrec->nmembers);
+               for (i = 0; i < xlrec->nmembers; i++)
+                       out_member(buf, &xlrec->members[i]);
        }
        else
                appendStringInfo(buf, "UNKNOWN");
index ad0abbfe8b343a52b2d51889d2169a432cacf886..506b208c9cfa117b91983d481eb83b0393a79107 100644 (file)
@@ -41,7 +41,8 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
 
                appendStringInfo(buf, "checkpoint: redo %X/%X; "
                                   "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; "
-                                                "oldest xid %u in DB %u; oldest running xid %u; %s",
+                                                "oldest xid %u in DB %u; oldest multi %u in DB %u; "
+                                                "oldest running xid %u; %s",
                                                 (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo,
                                                 checkpoint->ThisTimeLineID,
                                                 checkpoint->fullPageWrites ? "true" : "false",
@@ -51,6 +52,8 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
                                                 checkpoint->nextMultiOffset,
                                                 checkpoint->oldestXid,
                                                 checkpoint->oldestXidDB,
+                                                checkpoint->oldestMulti,
+                                                checkpoint->oldestMultiDB,
                                                 checkpoint->oldestActiveXid,
                                 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
        }
index 548ddbb4dd535fb0a170549db9dd76c3bf0a44e2..aabcbba49e8220506bf6653edacaa7117d944e7a 100644 (file)
@@ -791,10 +791,10 @@ parent transaction to complete.
 
 Not all transactional behaviour is emulated, for example we do not insert
 a transaction entry into the lock table, nor do we maintain the transaction
-stack in memory. Clog entries are made normally. Multitrans is not maintained
+stack in memory. Clog entries are made normally. Multixact is not maintained
 because its purpose is to record tuple level locks that an application has
-requested to prevent write locks. Since write locks cannot be obtained at all,
-there is never any conflict and so there is no reason to update multitrans.
+requested to prevent other tuple locks. Since tuple locks cannot be obtained at
+all, there is never any conflict and so there is no reason to update multixact.
 Subtrans is maintained during recovery but the details of the transaction
 tree are ignored and all subtransactions reference the top-level TransactionId
 directly. Since commit is atomic this provides correct lock wait behaviour
index 1ae671743c527516276bd8e04a3dd292890820df..9f804f759909d01a9309e5d5211bb9375d7f006f 100644 (file)
@@ -3,12 +3,18 @@
  * multixact.c
  *             PostgreSQL multi-transaction-log manager
  *
- * The pg_multixact manager is a pg_clog-like manager that stores an array
- * of TransactionIds for each MultiXactId.     It is a fundamental part of the
- * shared-row-lock implementation.     A share-locked tuple stores a
- * MultiXactId in its Xmax, and a transaction that needs to wait for the
- * tuple to be unlocked can sleep on the potentially-several TransactionIds
- * that compose the MultiXactId.
+ * The pg_multixact manager is a pg_clog-like manager that stores an array of
+ * MultiXactMember for each MultiXactId.  It is a fundamental part of the
+ * shared-row-lock implementation.  Each MultiXactMember is comprised of a
+ * TransactionId and a set of flag bits.  The name is a bit historical:
+ * originally, a MultiXactId consisted of more than one TransactionId (except
+ * in rare corner cases), hence "multi".  Nowadays, however, it's perfectly
+ * legitimate to have MultiXactIds that only include a single Xid.
+ *
+ * The meaning of the flag bits is opaque to this module, but they are mostly
+ * used in heapam.c to identify lock modes that each of the member transactions
+ * is holding on any given tuple.  This module just contains support to store
+ * and retrieve the arrays.
  *
  * We use two SLRU areas, one for storing the offsets at which the data
  * starts for each MultiXactId in the other one.  This trick allows us to
  * replay, the next-MXID and next-offset counters are at least as large as
  * anything we saw during replay.
  *
+ * We are able to remove segments no longer necessary by carefully tracking
+ * each table's used values: during vacuum, any multixact older than a
+ * certain value is removed; the cutoff value is stored in pg_class.
+ * The minimum value in each database is stored in pg_database, and the
+ * global minimum is part of pg_control.  Any vacuum that is able to
+ * advance its database's minimum value also computes a new global minimum,
+ * and uses this value to truncate older segments.  When new multixactid
+ * values are to be created, care is taken that the counter does not
+ * fall within the wraparound horizon considering the global minimum value.
  *
  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
 #include "access/twophase.h"
 #include "access/twophase_rmgr.h"
 #include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "commands/dbcommands.h"
+#include "funcapi.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "storage/lmgr.h"
+#include "storage/pmsignal.h"
 #include "storage/procarray.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
+#include "utils/snapmgr.h"
 
 
 /*
  * Defines for MultiXactOffset page sizes.     A page is the same BLCKSZ as is
  * used everywhere else in Postgres.
  *
- * Note: because both MultiXactOffsets and TransactionIds are 32 bits and
- * wrap around at 0xFFFFFFFF, MultiXact page numbering also wraps around at
- * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_SEGMENTS_PER_PAGE.     We need take no
- * explicit notice of that fact in this module, except when comparing segment
- * and page numbers in TruncateMultiXact
- * (see MultiXact{Offset,Member}PagePrecedes).
+ * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
+ * MultiXact page numbering also wraps around at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE.       We need
+ * take no explicit notice of that fact in this module, except when comparing
+ * segment and page numbers in TruncateMultiXact (see
+ * MultiXactOffsetPagePrecedes).
  */
 
-/* We need four bytes per offset and also four bytes per member */
+/* We need four bytes per offset */
 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
-#define MULTIXACT_MEMBERS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
 
 #define MultiXactIdToOffsetPage(xid) \
        ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 #define MultiXactIdToOffsetEntry(xid) \
        ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
 
-#define MXOffsetToMemberPage(xid) \
-       ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
-#define MXOffsetToMemberEntry(xid) \
-       ((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId.  To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids.  Each such 5-word (20-byte) set is called a "group",
+ * and groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep
+ * 409 groups per page.  This wastes 12 bytes per page, but that's OK --
+ * simplicity (and performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offsets, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT                     8
+#define MXACT_MEMBER_FLAGS_PER_BYTE                    1
+#define MXACT_MEMBER_XACT_BITMASK      ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP          4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP      \
+       (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+       (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE     \
+       (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
+
+/* Location (byte offset within page) of flag word for a given member */
+#define MXOffsetToFlagsOffset(xid) \
+       ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
+         (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
+        (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
+#define MXOffsetToFlagsBitShift(xid) \
+       (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
+        MXACT_MEMBER_BITS_PER_XACT)
+
+/* Location (byte offset within page) of TransactionId of given member */
+#define MXOffsetToMemberOffset(xid) \
+       (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
+        ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
 
 
 /*
@@ -117,6 +176,19 @@ typedef struct MultiXactStateData
        /* the Offset SLRU area was last truncated at this MultiXactId */
        MultiXactId lastTruncationPoint;
 
+       /*
+        * oldest multixact that is still on disk.  Anything older than this should
+        * not be consulted.
+        */
+       MultiXactId             oldestMultiXactId;
+       Oid                             oldestMultiXactDB;
+
+       /* support for anti-wraparound measures */
+       MultiXactId             multiVacLimit;
+       MultiXactId             multiWarnLimit;
+       MultiXactId             multiStopLimit;
+       MultiXactId             multiWrapLimit;
+
        /*
         * Per-backend data starts here.  We have two arrays stored in the area
         * immediately following the MultiXactStateData struct. Each is indexed by
@@ -180,7 +252,8 @@ static MultiXactId *OldestVisibleMXactId;
  * so they will be uninteresting by the time our next transaction starts.
  * (XXX not clear that this is correct --- other members of the MultiXact
  * could hang around longer than we did.  However, it's not clear what a
- * better policy for flushing old cache entries would be.)
+ * better policy for flushing old cache entries would be.)  FIXME actually
+ * this is plain wrong now that multixacts may contain update Xids.
  *
  * We allocate the cache entries in a memory context that is deleted at
  * transaction end, so we don't need to do retail freeing of entries.
@@ -189,53 +262,52 @@ typedef struct mXactCacheEnt
 {
        struct mXactCacheEnt *next;
        MultiXactId multi;
-       int                     nxids;
-       TransactionId xids[1];          /* VARIABLE LENGTH ARRAY */
+       int                     nmembers;
+       MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 } mXactCacheEnt;
 
 static mXactCacheEnt *MXactCache = NULL;
 static MemoryContext MXactContext = NULL;
 
-
 #ifdef MULTIXACT_DEBUG
 #define debug_elog2(a,b) elog(a,b)
 #define debug_elog3(a,b,c) elog(a,b,c)
 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
+#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
 #else
 #define debug_elog2(a,b)
 #define debug_elog3(a,b,c)
 #define debug_elog4(a,b,c,d)
 #define debug_elog5(a,b,c,d,e)
+#define debug_elog6(a,b,c,d,e,f)
 #endif
 
 /* internal MultiXactId management */
 static void MultiXactIdSetOldestVisible(void);
-static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids);
+static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
-                                  int nxids, TransactionId *xids);
-static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset);
+                                  int nmembers, MultiXactMember *members);
+static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
 
 /* MultiXact cache management */
-static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids);
-static int     mXactCacheGetById(MultiXactId multi, TransactionId **xids);
-static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids);
+static int mxactMemberComparator(const void *arg1, const void *arg2);
+static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
+static int     mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
+static void mXactCachePut(MultiXactId multi, int nmembers,
+                         MultiXactMember *members);
 
-#ifdef MULTIXACT_DEBUG
-static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids);
-#endif
+static char *mxstatus_to_string(MultiXactStatus status);
 
 /* management of SLRU infrastructure */
 static int     ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
 static int     ZeroMultiXactMemberPage(int pageno, bool writeXlog);
 static bool MultiXactOffsetPagePrecedes(int page1, int page2);
 static bool MultiXactMemberPagePrecedes(int page1, int page2);
-static bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
                                                MultiXactOffset offset2);
 static void ExtendMultiXactOffset(MultiXactId multi);
 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static void TruncateMultiXact(void);
 static void WriteMZeroPageXlogRec(int pageno, uint8 info);
 
 
@@ -243,21 +315,22 @@ static void WriteMZeroPageXlogRec(int pageno, uint8 info);
  * MultiXactIdCreate
  *             Construct a MultiXactId representing two TransactionIds.
  *
- * The two XIDs must be different.
+ * The two XIDs must be different, or be requesting different statuses.
  *
  * NB - we don't worry about our local MultiXactId cache here, because that
  * is handled by the lower-level routines.
  */
 MultiXactId
-MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
+MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
+                                 TransactionId xid2, MultiXactStatus status2)
 {
        MultiXactId newMulti;
-       TransactionId xids[2];
+       MultiXactMember members[2];
 
        AssertArg(TransactionIdIsValid(xid1));
        AssertArg(TransactionIdIsValid(xid2));
 
-       Assert(!TransactionIdEquals(xid1, xid2));
+       Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
 
        /*
         * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
@@ -265,13 +338,15 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
         * caller just did a check on xid1, so it'd be wasted effort.
         */
 
-       xids[0] = xid1;
-       xids[1] = xid2;
+       members[0].xid = xid1;
+       members[0].status = status1;
+       members[1].xid = xid2;
+       members[1].status = status2;
 
-       newMulti = CreateMultiXactId(2, xids);
+       newMulti = CreateMultiXactId(2, members);
 
-       debug_elog5(DEBUG2, "Create: returning %u for %u, %u",
-                               newMulti, xid1, xid2);
+       debug_elog3(DEBUG2, "Create: %s",
+                               mxid_to_string(newMulti, 2, members));
 
        return newMulti;
 }
@@ -280,22 +355,27 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
  * MultiXactIdExpand
  *             Add a TransactionId to a pre-existing MultiXactId.
  *
- * If the TransactionId is already a member of the passed MultiXactId,
- * just return it as-is.
+ * If the TransactionId is already a member of the passed MultiXactId with the
+ * same status, just return it as-is.
  *
  * Note that we do NOT actually modify the membership of a pre-existing
  * MultiXactId; instead we create a new one.  This is necessary to avoid
- * a race condition against MultiXactIdWait (see notes there).
+ * a race condition against code trying to wait for one MultiXactId to finish;
+ * see notes in heapam.c.
  *
  * NB - we don't worry about our local MultiXactId cache here, because that
  * is handled by the lower-level routines.
+ *
+ * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
+ * one upgraded by pg_upgrade from a cluster older than this feature) are not
+ * passed in.
  */
 MultiXactId
-MultiXactIdExpand(MultiXactId multi, TransactionId xid)
+MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 {
        MultiXactId newMulti;
-       TransactionId *members;
-       TransactionId *newMembers;
+       MultiXactMember *members;
+       MultiXactMember *newMembers;
        int                     nmembers;
        int                     i;
        int                     j;
@@ -303,13 +383,20 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
        AssertArg(MultiXactIdIsValid(multi));
        AssertArg(TransactionIdIsValid(xid));
 
-       debug_elog4(DEBUG2, "Expand: received multi %u, xid %u",
-                               multi, xid);
+       debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
+                               multi, xid, mxstatus_to_string(status));
 
-       nmembers = GetMultiXactIdMembers(multi, &members);
+       /*
+        * Note: we don't allow for old multis here.  The reason is that the
+        * only caller of this function does a check that the multixact is
+        * no longer running.
+        */
+       nmembers = GetMultiXactIdMembers(multi, &members, false);
 
        if (nmembers < 0)
        {
+               MultiXactMember         member;
+
                /*
                 * The MultiXactId is obsolete.  This can only happen if all the
                 * MultiXactId members stop running between the caller checking and
@@ -317,7 +404,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
                 * caller, but it would complicate the API and it's unlikely to happen
                 * too often, so just deal with it by creating a singleton MultiXact.
                 */
-               newMulti = CreateMultiXactId(1, &xid);
+               member.xid = xid;
+               member.status = status;
+               newMulti = CreateMultiXactId(1, &member);
 
                debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
                                        multi, newMulti);
@@ -325,12 +414,13 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
        }
 
        /*
-        * If the TransactionId is already a member of the MultiXactId, just
-        * return the existing MultiXactId.
+        * If the TransactionId is already a member of the MultiXactId with the
+        * same status, just return the existing MultiXactId.
         */
        for (i = 0; i < nmembers; i++)
        {
-               if (TransactionIdEquals(members[i], xid))
+               if (TransactionIdEquals(members[i].xid, xid) &&
+                       (members[i].status == status))
                {
                        debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
                                                xid, multi);
@@ -340,21 +430,31 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
        }
 
        /*
-        * Determine which of the members of the MultiXactId are still running,
-        * and use them to create a new one.  (Removing dead members is just an
-        * optimization, but a useful one.      Note we have the same race condition
-        * here as above: j could be 0 at the end of the loop.)
+        * Determine which of the members of the MultiXactId are still of interest.
+        * This is any running transaction, and also any transaction that grabbed
+        * something stronger than just a lock and was committed.  (An update that
+        * aborted is of no interest here.)
+        *
+        * (Removing dead members is just an optimization, but a useful one.
+        * Note we have the same race condition here as above: j could be 0 at the
+        * end of the loop.)
         */
-       newMembers = (TransactionId *)
-               palloc(sizeof(TransactionId) * (nmembers + 1));
+       newMembers = (MultiXactMember *)
+               palloc(sizeof(MultiXactMember) * (nmembers + 1));
 
        for (i = 0, j = 0; i < nmembers; i++)
        {
-               if (TransactionIdIsInProgress(members[i]))
-                       newMembers[j++] = members[i];
+               if (TransactionIdIsInProgress(members[i].xid) ||
+                       ((members[i].status > MultiXactStatusForUpdate) &&
+                        TransactionIdDidCommit(members[i].xid)))
+               {
+                       newMembers[j].xid = members[i].xid;
+                       newMembers[j++].status = members[i].status;
+               }
        }
 
-       newMembers[j++] = xid;
+       newMembers[j].xid = xid;
+       newMembers[j++].status = status;
        newMulti = CreateMultiXactId(j, newMembers);
 
        pfree(members);
@@ -372,17 +472,24 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
  * We return true if at least one member of the given MultiXactId is still
  * running.  Note that a "false" result is certain not to change,
  * because it is not legal to add members to an existing MultiXactId.
+ *
+ * Caller is expected to have verified that the multixact does not come from
+ * a pg_upgraded share-locked tuple.
  */
 bool
 MultiXactIdIsRunning(MultiXactId multi)
 {
-       TransactionId *members;
+       MultiXactMember *members;
        int                     nmembers;
        int                     i;
 
        debug_elog3(DEBUG2, "IsRunning %u?", multi);
 
-       nmembers = GetMultiXactIdMembers(multi, &members);
+       /*
+        * "false" here means we assume our callers have checked that the given
+        * multi cannot possibly come from a pg_upgraded database.
+        */
+       nmembers = GetMultiXactIdMembers(multi, &members, false);
 
        if (nmembers < 0)
        {
@@ -391,13 +498,15 @@ MultiXactIdIsRunning(MultiXactId multi)
        }
 
        /*
-        * Checking for myself is cheap compared to looking in shared memory, so
-        * first do the equivalent of MultiXactIdIsCurrent().  This is not needed
-        * for correctness, it's just a fast path.
+        * Checking for myself is cheap compared to looking in shared memory;
+        * return true if any live subtransaction of the current top-level
+        * transaction is a member.
+        *
+        * This is not needed for correctness, it's just a fast path.
         */
        for (i = 0; i < nmembers; i++)
        {
-               if (TransactionIdIsCurrentTransactionId(members[i]))
+               if (TransactionIdIsCurrentTransactionId(members[i].xid))
                {
                        debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
                        pfree(members);
@@ -412,10 +521,10 @@ MultiXactIdIsRunning(MultiXactId multi)
         */
        for (i = 0; i < nmembers; i++)
        {
-               if (TransactionIdIsInProgress(members[i]))
+               if (TransactionIdIsInProgress(members[i].xid))
                {
                        debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
-                                               i, members[i]);
+                                               i, members[i].xid);
                        pfree(members);
                        return true;
                }
@@ -428,55 +537,18 @@ MultiXactIdIsRunning(MultiXactId multi)
        return false;
 }
 
-/*
- * MultiXactIdIsCurrent
- *             Returns true if the current transaction is a member of the MultiXactId.
- *
- * We return true if any live subtransaction of the current top-level
- * transaction is a member.  This is appropriate for the same reason that a
- * lock held by any such subtransaction is globally equivalent to a lock
- * held by the current subtransaction: no such lock could be released without
- * aborting this subtransaction, and hence releasing its locks.  So it's not
- * necessary to add the current subxact to the MultiXact separately.
- */
-bool
-MultiXactIdIsCurrent(MultiXactId multi)
-{
-       bool            result = false;
-       TransactionId *members;
-       int                     nmembers;
-       int                     i;
-
-       nmembers = GetMultiXactIdMembers(multi, &members);
-
-       if (nmembers < 0)
-               return false;
-
-       for (i = 0; i < nmembers; i++)
-       {
-               if (TransactionIdIsCurrentTransactionId(members[i]))
-               {
-                       result = true;
-                       break;
-               }
-       }
-
-       pfree(members);
-
-       return result;
-}
-
 /*
  * MultiXactIdSetOldestMember
  *             Save the oldest MultiXactId this transaction could be a member of.
  *
- * We set the OldestMemberMXactId for a given transaction the first time
- * it's going to acquire a shared lock.  We need to do this even if we end
- * up using a TransactionId instead of a MultiXactId, because there is a
- * chance that another transaction would add our XID to a MultiXactId.
+ * We set the OldestMemberMXactId for a given transaction the first time it's
+ * going to do some operation that might require a MultiXactId (tuple lock,
+ * update or delete).  We need to do this even if we end up using a
+ * TransactionId instead of a MultiXactId, because there is a chance that
+ * another transaction would add our XID to a MultiXactId.
  *
- * The value to set is the next-to-be-assigned MultiXactId, so this is meant
- * to be called just before acquiring a shared lock.
+ * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
+ * be called just before doing any such possibly-MultiXactId-able operation.
  */
 void
 MultiXactIdSetOldestMember(void)
@@ -568,81 +640,23 @@ MultiXactIdSetOldestVisible(void)
 }
 
 /*
- * MultiXactIdWait
- *             Sleep on a MultiXactId.
- *
- * We do this by sleeping on each member using XactLockTableWait.  Any
- * members that belong to the current backend are *not* waited for, however;
- * this would not merely be useless but would lead to Assert failure inside
- * XactLockTableWait.  By the time this returns, it is certain that all
- * transactions *of other backends* that were members of the MultiXactId
- * are dead (and no new ones can have been added, since it is not legal
- * to add members to an existing MultiXactId).
- *
- * But by the time we finish sleeping, someone else may have changed the Xmax
- * of the containing tuple, so the caller needs to iterate on us somehow.
+ * ReadNextMultiXactId
+ *             Return the next MultiXactId to be assigned, but don't allocate it
  */
-void
-MultiXactIdWait(MultiXactId multi)
-{
-       TransactionId *members;
-       int                     nmembers;
-
-       nmembers = GetMultiXactIdMembers(multi, &members);
-
-       if (nmembers >= 0)
-       {
-               int                     i;
-
-               for (i = 0; i < nmembers; i++)
-               {
-                       TransactionId member = members[i];
-
-                       debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)",
-                                               i, member);
-                       if (!TransactionIdIsCurrentTransactionId(member))
-                               XactLockTableWait(member);
-               }
-
-               pfree(members);
-       }
-}
-
-/*
- * ConditionalMultiXactIdWait
- *             As above, but only lock if we can get the lock without blocking.
- */
-bool
-ConditionalMultiXactIdWait(MultiXactId multi)
+MultiXactId
+ReadNextMultiXactId(void)
 {
-       bool            result = true;
-       TransactionId *members;
-       int                     nmembers;
-
-       nmembers = GetMultiXactIdMembers(multi, &members);
-
-       if (nmembers >= 0)
-       {
-               int                     i;
+       MultiXactId             mxid;
 
-               for (i = 0; i < nmembers; i++)
-               {
-                       TransactionId member = members[i];
-
-                       debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)",
-                                               i, member);
-                       if (!TransactionIdIsCurrentTransactionId(member))
-                       {
-                               result = ConditionalXactLockTableWait(member);
-                               if (!result)
-                                       break;
-                       }
-               }
+       /* XXX we could presumably do this without a lock. */
+       LWLockAcquire(MultiXactGenLock, LW_SHARED);
+       mxid = MultiXactState->nextMXact;
+       LWLockRelease(MultiXactGenLock);
 
-               pfree(members);
-       }
+       if (mxid < FirstMultiXactId)
+               mxid = FirstMultiXactId;
 
-       return result;
+       return mxid;
 }
 
 /*
@@ -652,10 +666,10 @@ ConditionalMultiXactIdWait(MultiXactId multi)
  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
  * given TransactionIds as members.  Returns the newly created MultiXactId.
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members[] array will be sorted in-place.
  */
 static MultiXactId
-CreateMultiXactId(int nxids, TransactionId *xids)
+CreateMultiXactId(int nmembers, MultiXactMember *members)
 {
        MultiXactId multi;
        MultiXactOffset offset;
@@ -663,10 +677,10 @@ CreateMultiXactId(int nxids, TransactionId *xids)
        xl_multixact_create xlrec;
 
        debug_elog3(DEBUG2, "Create: %s",
-                               mxid_to_string(InvalidMultiXactId, nxids, xids));
+                               mxid_to_string(InvalidMultiXactId, nmembers, members));
 
        /*
-        * See if the same set of XIDs already exists in our cache; if so, just
+        * See if the same set of members already exists in our cache; if so, just
         * re-use that MultiXactId.  (Note: it might seem that looking in our
         * cache is insufficient, and we ought to search disk to see if a
         * duplicate definition already exists.  But since we only ever create
@@ -675,7 +689,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
         * corner cases where someone else added us to a MultiXact without our
         * knowledge, but it's not worth checking for.)
         */
-       multi = mXactCacheGetBySet(nxids, xids);
+       multi = mXactCacheGetBySet(nmembers, members);
        if (MultiXactIdIsValid(multi))
        {
                debug_elog2(DEBUG2, "Create: in cache!");
@@ -687,7 +701,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
         * in the OFFSETs and MEMBERs files.  NB: this routine does
         * START_CRIT_SECTION().
         */
-       multi = GetNewMultiXactId(nxids, &offset);
+       multi = GetNewMultiXactId(nmembers, &offset);
 
        /*
         * Make an XLOG entry describing the new MXID.
@@ -704,27 +718,34 @@ CreateMultiXactId(int nxids, TransactionId *xids)
         */
        xlrec.mid = multi;
        xlrec.moff = offset;
-       xlrec.nxids = nxids;
+       xlrec.nmembers = nmembers;
 
+       /*
+        * XXX Note: there's a lot of padding space in MultiXactMember.  We could
+        * find a more compact representation of this Xlog record -- perhaps all the
+        * status flags in one XLogRecData, then all the xids in another one?  Not
+        * clear that it's worth the trouble though.
+        */
        rdata[0].data = (char *) (&xlrec);
-       rdata[0].len = MinSizeOfMultiXactCreate;
+       rdata[0].len = SizeOfMultiXactCreate;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);
-       rdata[1].data = (char *) xids;
-       rdata[1].len = nxids * sizeof(TransactionId);
+
+       rdata[1].data = (char *) members;
+       rdata[1].len = nmembers * sizeof(MultiXactMember);
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = NULL;
 
        (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata);
 
        /* Now enter the information into the OFFSETs and MEMBERs logs */
-       RecordNewMultiXact(multi, offset, nxids, xids);
+       RecordNewMultiXact(multi, offset, nmembers, members);
 
        /* Done with critical section */
        END_CRIT_SECTION();
 
        /* Store the new MultiXactId in the local cache, too */
-       mXactCachePut(multi, nxids, xids);
+       mXactCachePut(multi, nmembers, members);
 
        debug_elog2(DEBUG2, "Create: all done");
 
@@ -739,7 +760,7 @@ CreateMultiXactId(int nxids, TransactionId *xids)
  */
 static void
 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
-                                  int nxids, TransactionId *xids)
+                                  int nmembers, MultiXactMember *members)
 {
        int                     pageno;
        int                     prev_pageno;
@@ -775,12 +796,21 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 
        prev_pageno = -1;
 
-       for (i = 0; i < nxids; i++, offset++)
+       for (i = 0; i < nmembers; i++, offset++)
        {
                TransactionId *memberptr;
+               uint32     *flagsptr;
+               uint32          flagsval;
+               int                     bshift;
+               int                     flagsoff;
+               int                     memberoff;
+
+               Assert(members[i].status <= MultiXactStatusUpdate);
 
                pageno = MXOffsetToMemberPage(offset);
-               entryno = MXOffsetToMemberEntry(offset);
+               memberoff = MXOffsetToMemberOffset(offset);
+               flagsoff = MXOffsetToFlagsOffset(offset);
+               bshift = MXOffsetToFlagsBitShift(offset);
 
                if (pageno != prev_pageno)
                {
@@ -789,10 +819,17 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
                }
 
                memberptr = (TransactionId *)
-                       MultiXactMemberCtl->shared->page_buffer[slotno];
-               memberptr += entryno;
+                       (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+
+               *memberptr = members[i].xid;
+
+               flagsptr = (uint32 *)
+                       (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
 
-               *memberptr = xids[i];
+               flagsval = *flagsptr;
+               flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+               flagsval |= (members[i].status << bshift);
+               *flagsptr = flagsval;
 
                MultiXactMemberCtl->shared->page_dirty[slotno] = true;
        }
@@ -816,27 +853,115 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
  * caller must end the critical section after writing SLRU data.
  */
 static MultiXactId
-GetNewMultiXactId(int nxids, MultiXactOffset *offset)
+GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 {
        MultiXactId result;
        MultiXactOffset nextOffset;
 
-       debug_elog3(DEBUG2, "GetNew: for %d xids", nxids);
+       debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
 
        /* MultiXactIdSetOldestMember() must have been called already */
        Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
 
+       /* safety check, we should never get this far in a HS slave */
+       if (RecoveryInProgress())
+               elog(ERROR, "cannot assign MultiXactIds during recovery");
+
        LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 
        /* Handle wraparound of the nextMXact counter */
        if (MultiXactState->nextMXact < FirstMultiXactId)
                MultiXactState->nextMXact = FirstMultiXactId;
 
-       /*
-        * Assign the MXID, and make sure there is room for it in the file.
-        */
+       /* Assign the MXID */
        result = MultiXactState->nextMXact;
 
+       /*----------
+        * Check to see if it's safe to assign another MultiXactId.  This protects
+        * against catastrophic data loss due to multixact wraparound.  The basic
+        * rules are:
+        *
+        * If we're past multiVacLimit, start trying to force autovacuum cycles.
+        * If we're past multiWarnLimit, start issuing warnings.
+        * If we're past multiStopLimit, refuse to create new MultiXactIds.
+        *
+        * Note these are pretty much the same protections as in GetNewTransactionId.
+        *----------
+        */
+       if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
+       {
+               /*
+                * For safety's sake, we release MultiXactGenLock while sending
+                * signals, warnings, etc.  This is not so much because we care about
+                * preserving concurrency in this situation, as to avoid any
+                * possibility of deadlock while doing get_database_name(). First,
+                * copy all the shared values we'll need in this path.
+                */
+               MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
+               MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
+               MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
+               Oid                     oldest_datoid = MultiXactState->oldestMultiXactDB;
+
+               LWLockRelease(MultiXactGenLock);
+
+               /*
+                * To avoid swamping the postmaster with signals, we issue the autovac
+                * request only once per 64K MultiXactIds assigned.  This still gives
+                * plenty of chances before we get into real trouble.
+                */
+               if (IsUnderPostmaster && (result % 65536) == 0)
+                       SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+               if (IsUnderPostmaster &&
+                       !MultiXactIdPrecedes(result, multiStopLimit))
+               {
+                       char       *oldest_datname = get_database_name(oldest_datoid);
+
+                       /* complain even if that DB has disappeared */
+                       if (oldest_datname)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                                errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
+                                                               oldest_datname),
+                                                errhint("Execute a database-wide VACUUM in that database.\n"
+                                                                "You might also need to commit or roll back old prepared transactions.")));
+                       else
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                                errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
+                                                               oldest_datoid),
+                                                errhint("Execute a database-wide VACUUM in that database.\n"
+                                                                "You might also need to commit or roll back old prepared transactions.")));
+               }
+               else if (!MultiXactIdPrecedes(result, multiWarnLimit))
+               {
+                       char       *oldest_datname = get_database_name(oldest_datoid);
+
+                       /* complain even if that DB has disappeared */
+                       if (oldest_datname)
+                               ereport(WARNING,
+                                               (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used",
+                                                               oldest_datname,
+                                                               multiWrapLimit - result),
+                                                errhint("Execute a database-wide VACUUM in that database.\n"
+                                                                "You might also need to commit or roll back old prepared transactions.")));
+                       else
+                               ereport(WARNING,
+                                               (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used",
+                                                               oldest_datoid,
+                                                               multiWrapLimit - result),
+                                                errhint("Execute a database-wide VACUUM in that database.\n"
+                                                                "You might also need to commit or roll back old prepared transactions.")));
+               }
+
+               /* Re-acquire lock and start over */
+               LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+               result = MultiXactState->nextMXact;
+               if (result < FirstMultiXactId)
+                       result = FirstMultiXactId;
+       }
+
+       /* Make sure there is room for the MXID in the file.  */
        ExtendMultiXactOffset(result);
 
        /*
@@ -848,12 +973,12 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
        if (nextOffset == 0)
        {
                *offset = 1;
-               nxids++;                                /* allocate member slot 0 too */
+               nmembers++;                             /* allocate member slot 0 too */
        }
        else
                *offset = nextOffset;
 
-       ExtendMultiXactMember(nextOffset, nxids);
+       ExtendMultiXactMember(nextOffset, nmembers);
 
        /*
         * Critical section from here until caller has written the data into the
@@ -870,13 +995,14 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
         *
         * We don't care about MultiXactId wraparound here; it will be handled by
         * the next iteration.  But note that nextMXact may be InvalidMultiXactId
-        * after this routine exits, so anyone else looking at the variable must
-        * be prepared to deal with that.  Similarly, nextOffset may be zero, but
-        * we won't use that as the actual start offset of the next multixact.
+        * or the first value on a segment-beginning page after this routine exits,
+        * so anyone else looking at the variable must be prepared to deal with
+        * either case.  Similarly, nextOffset may be zero, but we won't use that
+        * as the actual start offset of the next multixact.
         */
        (MultiXactState->nextMXact)++;
 
-       MultiXactState->nextOffset += nxids;
+       MultiXactState->nextOffset += nmembers;
 
        LWLockRelease(MultiXactGenLock);
 
@@ -886,14 +1012,23 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
 
 /*
  * GetMultiXactIdMembers
- *             Returns the set of TransactionIds that make up a MultiXactId
+ *             Returns the set of MultiXactMembers that make up a MultiXactId
+ *
+ * If the given MultiXactId is older than the value we know to be oldest, we
+ * return -1.  The caller is expected to allow that only in permissible cases,
+ * i.e. when the infomask lets it presuppose that the tuple had been
+ * share-locked before a pg_upgrade; this means that the HEAP_XMAX_LOCK_ONLY
+ * bit must be set, while HEAP_XMAX_KEYSHR_LOCK and HEAP_XMAX_EXCL_LOCK must
+ * not be.
  *
- * We return -1 if the MultiXactId is too old to possibly have any members
- * still running; in that case we have not actually looked them up, and
- * *xids is not set.
+ * Other border conditions, such as trying to read a value that's larger than
+ * the value currently known as the next to assign, raise an error.  Previously
+ * these also returned -1, but since this can lead to the wrong visibility
+ * results, it is dangerous to do that.
  */
 int
-GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
+GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
+                                         bool allow_old)
 {
        int                     pageno;
        int                     prev_pageno;
@@ -904,21 +1039,22 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
        int                     length;
        int                     truelength;
        int                     i;
+       MultiXactId oldestMXact;
        MultiXactId nextMXact;
        MultiXactId tmpMXact;
        MultiXactOffset nextOffset;
-       TransactionId *ptr;
+       MultiXactMember *ptr;
 
        debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
        Assert(MultiXactIdIsValid(multi));
 
        /* See if the MultiXactId is in the local cache */
-       length = mXactCacheGetById(multi, xids);
+       length = mXactCacheGetById(multi, members);
        if (length >= 0)
        {
                debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
-                                       mxid_to_string(multi, length, *xids));
+                                       mxid_to_string(multi, length, *members));
                return length;
        }
 
@@ -928,43 +1064,48 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
        /*
         * We check known limits on MultiXact before resorting to the SLRU area.
         *
-        * An ID older than our OldestVisibleMXactId[] entry can't possibly still
-        * be running, and we'd run the risk of trying to read already-truncated
-        * SLRU data if we did try to examine it.
+        * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
+        * useful; it should have already been frozen by vacuum.  We've truncated
+        * the on-disk structures anyway.  Returning the wrong values could lead to
+        * an incorrect visibility result.  However, to support pg_upgrade we need
+        * to allow an empty set to be returned regardless, if the caller is
+        * willing to accept it; the caller is expected to check that it's an
+        * allowed condition (such as ensuring that the infomask bits set on the
+        * tuple are consistent with the pg_upgrade scenario).  If the caller is
+        * expecting this to be called only on recently created multis, then we
+        * raise an error.
         *
         * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
-        * seen, it implies undetected ID wraparound has occurred.      We just
-        * silently assume that such an ID is no longer running.
+        * seen, it implies undetected ID wraparound has occurred.      This raises
+        * a hard error.
         *
         * Shared lock is enough here since we aren't modifying any global state.
-        * Also, we can examine our own OldestVisibleMXactId without the lock,
-        * since no one else is allowed to change it.
-        */
-       if (MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
-       {
-               debug_elog2(DEBUG2, "GetMembers: it's too old");
-               *xids = NULL;
-               return -1;
-       }
-
-       /*
-        * Acquire the shared lock just long enough to grab the current counter
-        * values.      We may need both nextMXact and nextOffset; see below.
+        * Acquire it just long enough to grab the current counter values.      We may
+        * need both nextMXact and nextOffset; see below.
         */
        LWLockAcquire(MultiXactGenLock, LW_SHARED);
 
+       oldestMXact = MultiXactState->oldestMultiXactId;
        nextMXact = MultiXactState->nextMXact;
        nextOffset = MultiXactState->nextOffset;
 
        LWLockRelease(MultiXactGenLock);
 
-       if (!MultiXactIdPrecedes(multi, nextMXact))
+       if (MultiXactIdPrecedes(multi, oldestMXact))
        {
-               debug_elog2(DEBUG2, "GetMembers: it's too new!");
-               *xids = NULL;
+               ereport(allow_old ? DEBUG1 : ERROR,
+                               (errcode(ERRCODE_INTERNAL_ERROR),
+                                errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
+                                               multi)));
                return -1;
        }
 
+       if (!MultiXactIdPrecedes(multi, nextMXact))
+               ereport(ERROR,
+                               (errcode(ERRCODE_INTERNAL_ERROR),
+                                errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
+                                               multi)));
+
        /*
         * Find out the offset at which we need to start reading MultiXactMembers
         * and the number of members in the multixact.  We determine the latter as
@@ -1055,8 +1196,8 @@ retry:
 
        LWLockRelease(MultiXactOffsetControlLock);
 
-       ptr = (TransactionId *) palloc(length * sizeof(TransactionId));
-       *xids = ptr;
+       ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
+       *members = ptr;
 
        /* Now get the members themselves. */
        LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
@@ -1066,9 +1207,13 @@ retry:
        for (i = 0; i < length; i++, offset++)
        {
                TransactionId *xactptr;
+               uint32     *flagsptr;
+               int                     flagsoff;
+               int                     bshift;
+               int                     memberoff;
 
                pageno = MXOffsetToMemberPage(offset);
-               entryno = MXOffsetToMemberEntry(offset);
+               memberoff = MXOffsetToMemberOffset(offset);
 
                if (pageno != prev_pageno)
                {
@@ -1077,8 +1222,7 @@ retry:
                }
 
                xactptr = (TransactionId *)
-                       MultiXactMemberCtl->shared->page_buffer[slotno];
-               xactptr += entryno;
+                       (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
                if (!TransactionIdIsValid(*xactptr))
                {
@@ -1087,7 +1231,13 @@ retry:
                        continue;
                }
 
-               ptr[truelength++] = *xactptr;
+               flagsoff = MXOffsetToFlagsOffset(offset);
+               bshift = MXOffsetToFlagsBitShift(offset);
+               flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+
+               ptr[truelength].xid = *xactptr;
+               ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+               truelength++;
        }
 
        LWLockRelease(MultiXactMemberControlLock);
@@ -1102,6 +1252,30 @@ retry:
        return truelength;
 }
 
+/*
+ * mxactMemberComparator
+ *             qsort comparator imposing a total order on MultiXactMember entries
+ *
+ * Entries are ordered by plain comparison of xid, with status used as the
+ * tie-breaker.  Wraparound-aware XID comparison is deliberately avoided: it
+ * does not respect the triangle inequality, so it cannot serve as a sort
+ * order.  Which order we end up with is irrelevant, as long as it is
+ * consistent.
+ */
+static int
+mxactMemberComparator(const void *arg1, const void *arg2)
+{
+       const MultiXactMember *lhs = (const MultiXactMember *) arg1;
+       const MultiXactMember *rhs = (const MultiXactMember *) arg2;
+
+       if (lhs->xid != rhs->xid)
+               return (lhs->xid > rhs->xid) ? 1 : -1;
+       if (lhs->status != rhs->status)
+               return (lhs->status > rhs->status) ? 1 : -1;
+       return 0;
+}
+
 /*
  * mXactCacheGetBySet
  *             returns a MultiXactId from the cache based on the set of
@@ -1113,26 +1287,29 @@ retry:
  * for the majority of tuples, thus keeping MultiXactId usage low (saving
  * both I/O and wraparound issues).
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members array will be sorted in-place.
  */
 static MultiXactId
-mXactCacheGetBySet(int nxids, TransactionId *xids)
+mXactCacheGetBySet(int nmembers, MultiXactMember *members)
 {
        mXactCacheEnt *entry;
 
        debug_elog3(DEBUG2, "CacheGet: looking for %s",
-                               mxid_to_string(InvalidMultiXactId, nxids, xids));
+                               mxid_to_string(InvalidMultiXactId, nmembers, members));
 
        /* sort the array so comparison is easy */
-       qsort(xids, nxids, sizeof(TransactionId), xidComparator);
+       qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
 
        for (entry = MXactCache; entry != NULL; entry = entry->next)
        {
-               if (entry->nxids != nxids)
+               if (entry->nmembers != nmembers)
                        continue;
 
-               /* We assume the cache entries are sorted */
-               if (memcmp(xids, entry->xids, nxids * sizeof(TransactionId)) == 0)
+               /*
+                * We assume the cache entries are sorted, and that the unused bits in
+                * "status" are zeroed.
+                */
+               if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
                {
                        debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
                        return entry->multi;
@@ -1145,14 +1322,14 @@ mXactCacheGetBySet(int nxids, TransactionId *xids)
 
 /*
  * mXactCacheGetById
- *             returns the composing TransactionId set from the cache for a
+ *             returns the composing MultiXactMember set from the cache for a
  *             given MultiXactId, if present.
  *
- * If successful, *xids is set to the address of a palloc'd copy of the
- * TransactionId set.  Return value is number of members, or -1 on failure.
+ * If successful, *members is set to the address of a palloc'd copy of the
+ * MultiXactMember set.  Return value is number of members, or -1 on failure.
  */
 static int
-mXactCacheGetById(MultiXactId multi, TransactionId **xids)
+mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
 {
        mXactCacheEnt *entry;
 
@@ -1162,18 +1339,18 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
        {
                if (entry->multi == multi)
                {
-                       TransactionId *ptr;
+                       MultiXactMember *ptr;
                        Size            size;
 
-                       size = sizeof(TransactionId) * entry->nxids;
-                       ptr = (TransactionId *) palloc(size);
-                       *xids = ptr;
+                       size = sizeof(MultiXactMember) * entry->nmembers;
+                       ptr = (MultiXactMember *) palloc(size);
+                       *members = ptr;
 
-                       memcpy(ptr, entry->xids, size);
+                       memcpy(ptr, entry->members, size);
 
                        debug_elog3(DEBUG2, "CacheGet: found %s",
-                                               mxid_to_string(multi, entry->nxids, entry->xids));
-                       return entry->nxids;
+                                               mxid_to_string(multi, entry->nmembers, entry->members));
+                       return entry->nmembers;
                }
        }
 
@@ -1186,12 +1363,12 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
  *             Add a new MultiXactId and its composing set into the local cache.
  */
 static void
-mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
+mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
 {
        mXactCacheEnt *entry;
 
        debug_elog3(DEBUG2, "CachePut: storing %s",
-                               mxid_to_string(multi, nxids, xids));
+                               mxid_to_string(multi, nmembers, members));
 
        if (MXactContext == NULL)
        {
@@ -1206,36 +1383,67 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
 
        entry = (mXactCacheEnt *)
                MemoryContextAlloc(MXactContext,
-                                                  offsetof(mXactCacheEnt, xids) +
-                                                  nxids * sizeof(TransactionId));
+                                                  offsetof(mXactCacheEnt, members) +
+                                                  nmembers * sizeof(MultiXactMember));
 
        entry->multi = multi;
-       entry->nxids = nxids;
-       memcpy(entry->xids, xids, nxids * sizeof(TransactionId));
+       entry->nmembers = nmembers;
+       memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
 
        /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
-       qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator);
+       qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
 
        entry->next = MXactCache;
        MXactCache = entry;
 }
 
-#ifdef MULTIXACT_DEBUG
+/*
+ * mxstatus_to_string
+ *             Map a MultiXactStatus value to a short constant label.
+ *
+ * The returned pointer refers to a string literal, so callers must neither
+ * modify nor free it.  A status value not listed in the switch is reported
+ * through elog(ERROR).
+ */
 static char *
-mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids)
+mxstatus_to_string(MultiXactStatus status)
+{
+       switch (status)
+       {
+               case MultiXactStatusForKeyShare:
+                       return "keysh";
+               case MultiXactStatusForShare:
+                       return "sh";
+               case MultiXactStatusForNoKeyUpdate:
+                       return "fornokeyupd";
+               case MultiXactStatusForUpdate:
+                       return "forupd";
+               case MultiXactStatusNoKeyUpdate:
+                       return "nokeyupd";
+               case MultiXactStatusUpdate:
+                       return "upd";
+               default:
+                       elog(ERROR, "unrecognized multixact status %d", status);
+                       return "";
+       }
+}
+
+/*
+ * mxid_to_string
+ *             Build a printable description of a MultiXactId and its members.
+ *
+ * The result has the form "multi nmembers[xid (status), ...]".  It is
+ * allocated in TopMemoryContext and is owned by this function: it stays
+ * valid only until the next call, which frees it, so callers must not free
+ * it themselves nor keep the pointer around.
+ */
+char *
+mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
 {
-       char       *str = palloc(15 * (nxids + 1) + 4);
+       static char        *str = NULL;
+       StringInfoData  buf;
        int                     i;
 
-       snprintf(str, 47, "%u %d[%u", multi, nxids, xids[0]);
+       /*
+        * Release the previous result.  Reset the static pointer right away: if
+        * any allocation below errors out, the next call must not find a
+        * dangling pointer here and free it a second time.
+        */
+       if (str != NULL)
+       {
+               pfree(str);
+               str = NULL;
+       }
 
-       for (i = 1; i < nxids; i++)
-               snprintf(str + strlen(str), 17, ", %u", xids[i]);
+       initStringInfo(&buf);
 
-       strcat(str, "]");
+       appendStringInfo(&buf, "%u %d[", multi, nmembers);
+
+       /*
+        * Start the loop at zero so that an empty member set never reads
+        * members[0]; for one or more members the output is unchanged.
+        */
+       for (i = 0; i < nmembers; i++)
+               appendStringInfo(&buf, "%s%u (%s)", (i > 0) ? ", " : "",
+                                                members[i].xid,
+                                                mxstatus_to_string(members[i].status));
+
+       appendStringInfoChar(&buf, ']');
+       str = MemoryContextStrdup(TopMemoryContext, buf.data);
+       pfree(buf.data);
        return str;
 }
-#endif
 
 /*
  * AtEOXact_MultiXact
@@ -1512,8 +1720,9 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
  * This must be called ONCE during postmaster or standalone-backend startup.
  *
  * StartupXLOG has already established nextMXact/nextOffset by calling
- * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact.     Note that we
- * may already have replayed WAL data into the SLRU files.
+ * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
+ * info from pg_control and/or MultiXactAdvanceOldest.  Note that we may
+ * already have replayed WAL data into the SLRU files.
  *
  * We don't need any locks here, really; the SLRU locks are taken
  * only because slru.c expects to be called with locks held.
@@ -1525,6 +1734,7 @@ StartupMultiXact(void)
        MultiXactOffset offset = MultiXactState->nextOffset;
        int                     pageno;
        int                     entryno;
+       int                     flagsoff;
 
        /* Clean up offsets state */
        LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
@@ -1569,28 +1779,30 @@ StartupMultiXact(void)
         * Zero out the remainder of the current members page.  See notes in
         * TrimCLOG() for motivation.
         */
-       entryno = MXOffsetToMemberEntry(offset);
-       if (entryno != 0)
+       flagsoff = MXOffsetToFlagsOffset(offset);
+       if (flagsoff != 0)
        {
                int                     slotno;
                TransactionId *xidptr;
+               int                     memberoff;
 
+               memberoff = MXOffsetToMemberOffset(offset);
                slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
-               xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno];
-               xidptr += entryno;
+               xidptr = (TransactionId *)
+                       (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
-               MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId)));
+               MemSet(xidptr, 0, BLCKSZ - memberoff);
+
+               /*
+                * Note: we don't need to zero out the flag bits in the remaining
+                * members of the current group, because they are always reset before
+                * writing.
+                */
 
                MultiXactMemberCtl->shared->page_dirty[slotno] = true;
        }
 
        LWLockRelease(MultiXactMemberControlLock);
-
-       /*
-        * Initialize lastTruncationPoint to invalid, ensuring that the first
-        * checkpoint will try to do truncation.
-        */
-       MultiXactState->lastTruncationPoint = InvalidMultiXactId;
 }
 
 /*
@@ -1607,22 +1819,25 @@ ShutdownMultiXact(void)
 }
 
 /*
- * Get the next MultiXactId and offset to save in a checkpoint record
+ * Get the MultiXact data to save in a checkpoint record
+ *
+ * All four output values (*nextMulti, *nextMultiOffset, *oldestMulti and
+ * *oldestMultiDB) are copied from MultiXactState while MultiXactGenLock is
+ * held in shared mode.  is_shutdown is not used by this function.
  */
 void
 MultiXactGetCheckptMulti(bool is_shutdown,
                                                 MultiXactId *nextMulti,
-                                                MultiXactOffset *nextMultiOffset)
+                                                MultiXactOffset *nextMultiOffset,
+                                                MultiXactId *oldestMulti,
+                                                Oid *oldestMultiDB)
 {
        LWLockAcquire(MultiXactGenLock, LW_SHARED);
-
        *nextMulti = MultiXactState->nextMXact;
        *nextMultiOffset = MultiXactState->nextOffset;
-
+       *oldestMulti = MultiXactState->oldestMultiXactId;
+       *oldestMultiDB = MultiXactState->oldestMultiXactDB;
        LWLockRelease(MultiXactGenLock);
 
-       debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u",
-                               *nextMulti, *nextMultiOffset);
+       debug_elog6(DEBUG2,
+                               "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
+                               *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
 }
 
 /*
@@ -1637,17 +1852,6 @@ CheckPointMultiXact(void)
        SimpleLruFlush(MultiXactOffsetCtl, true);
        SimpleLruFlush(MultiXactMemberCtl, true);
 
-       /*
-        * Truncate the SLRU files.  This could be done at any time, but
-        * checkpoint seems a reasonable place for it.  There is one exception: if
-        * we are called during xlog recovery, then shared->latest_page_number
-        * isn't valid (because StartupMultiXact hasn't been called yet) and so
-        * SimpleLruTruncate would get confused.  It seems best not to risk
-        * removing any data during recovery anyway, so don't truncate.
-        */
-       if (!RecoveryInProgress())
-               TruncateMultiXact();
-
        TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
 }
 
@@ -1671,9 +1875,129 @@ MultiXactSetNextMXact(MultiXactId nextMulti,
        LWLockRelease(MultiXactGenLock);
 }
 
+/*
+ * SetMultiXactIdLimit
+ *             Recompute the MultiXactId wraparound limits and publish them.
+ *
+ * oldest_datminmxid is the currently oldest datminmxid (ie, the oldest
+ * MultiXactId that might exist in any database of our cluster), and
+ * oldest_datoid is the OID of the (or a) database with that value.
+ *
+ * From these we derive the vacuum-forcing, warning, stop and wrap limits and
+ * store them, together with the two inputs, in MultiXactState while holding
+ * MultiXactGenLock exclusively.  Afterwards, outside recovery, we signal the
+ * autovacuum launcher and/or emit a WARNING if nextMXact is already past the
+ * corresponding limit.
+ */
+void
+SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
+{
+       MultiXactId     multiVacLimit;
+       MultiXactId     multiWarnLimit;
+       MultiXactId     multiStopLimit;
+       MultiXactId     multiWrapLimit;
+       MultiXactId     curMulti;
+
+       Assert(MultiXactIdIsValid(oldest_datminmxid));
+
+       /*
+        * The place where we actually get into deep trouble is halfway around
+        * from the oldest potentially-existing multi.  (This calculation may be
+        * off by a count or so, because the values below FirstMultiXactId are
+        * skipped and so reduce the size of the loop a little bit.  But we throw
+        * in plenty of slop below, so it doesn't matter.)
+        */
+       multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
+       if (multiWrapLimit < FirstMultiXactId)
+               multiWrapLimit += FirstMultiXactId;
+
+       /*
+        * We'll refuse to continue assigning MultiXactIds once we get within 100
+        * multis of data loss.
+        */
+       multiStopLimit = multiWrapLimit - 100;
+       if (multiStopLimit < FirstMultiXactId)
+               multiStopLimit -= FirstMultiXactId;
+
+       /*
+        * We'll start complaining loudly when we get within 10M multis of the stop
+        * point.       This is kind of arbitrary, but if you let your gas gauge get
+        * down to 1% of full, would you be looking for the next gas station?  We
+        * need to be fairly liberal about this number because there are lots of
+        * scenarios where most transactions are done by automatic clients that
+        * won't pay attention to warnings. (No, we're not gonna make this
+        * configurable.  If you know enough to configure it, you know enough to
+        * not get in this kind of trouble in the first place.)
+        */
+       multiWarnLimit = multiStopLimit - 10000000;
+       if (multiWarnLimit < FirstMultiXactId)
+               multiWarnLimit -= FirstMultiXactId;
+
+       /*
+        * We'll start trying to force autovacuums when oldest_datminmxid gets
+        * to be more than 200 million multis old.
+        */
+       multiVacLimit = oldest_datminmxid + 200000000;
+       if (multiVacLimit < FirstMultiXactId)
+               multiVacLimit += FirstMultiXactId;
+
+       /* Grab lock for just long enough to set the new limit values */
+       LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+       MultiXactState->oldestMultiXactId = oldest_datminmxid;
+       MultiXactState->oldestMultiXactDB = oldest_datoid;
+       MultiXactState->multiVacLimit = multiVacLimit;
+       MultiXactState->multiWarnLimit = multiWarnLimit;
+       MultiXactState->multiStopLimit = multiStopLimit;
+       MultiXactState->multiWrapLimit = multiWrapLimit;
+       curMulti = MultiXactState->nextMXact;
+       LWLockRelease(MultiXactGenLock);
+
+       /* Log the info */
+       ereport(DEBUG1,
+                       (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
+                                       multiWrapLimit, oldest_datoid)));
+
+       /*
+        * If past the autovacuum force point, immediately signal an autovac
+        * request.  The reason for this is that autovac only processes one
+        * database per invocation.  Once it's finished cleaning up the oldest
+        * database, it'll call here, and we'll signal the postmaster to start
+        * another iteration immediately if there are still any old databases.
+        */
+       if (MultiXactIdPrecedes(multiVacLimit, curMulti) &&
+               IsUnderPostmaster && !InRecovery)
+               SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+       /* Give an immediate warning if past the wrap warn point */
+       if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && !InRecovery)
+       {
+               char       *oldest_datname;
+
+               /*
+                * We can be called when not inside a transaction, for example during
+                * StartupXLOG().  In such a case we cannot do database access, so we
+                * must just report the oldest DB's OID.
+                *
+                * Note: it's also possible that get_database_name fails and returns
+                * NULL, for example because the database just got dropped.  We'll
+                * still warn, even though the warning might now be unnecessary.
+                */
+               if (IsTransactionState())
+                       oldest_datname = get_database_name(oldest_datoid);
+               else
+                       oldest_datname = NULL;
+
+               /*
+                * The wording below deliberately uses the plural "MultiXactIds", the
+                * same as the equivalent warnings issued when a new MultiXactId is
+                * assigned.
+                */
+               if (oldest_datname)
+                       ereport(WARNING,
+                                       (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used",
+                                                       oldest_datname,
+                                                       multiWrapLimit - curMulti),
+                                        errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+                                                        "You might also need to commit or roll back old prepared transactions.")));
+               else
+                       ereport(WARNING,
+                                       (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used",
+                                                       oldest_datoid,
+                                                       multiWrapLimit - curMulti),
+                                        errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+                                                        "You might also need to commit or roll back old prepared transactions.")));
+       }
+}
+
 /*
  * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
- * and similarly nextOffset is at least minMultiOffset
+ * and similarly nextOffset is at least minMultiOffset.
  *
  * This is used when we can determine minimum safe values from an XLog
  * record (either an on-line checkpoint or an mxact creation log entry).
@@ -1699,6 +2023,17 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti,
        LWLockRelease(MultiXactGenLock);
 }
 
+/*
+ * MultiXactAdvanceOldest
+ *             Move our oldestMultiXactId forward to oldestMulti, never backward.
+ *
+ * When the value we already have does not precede oldestMulti there is
+ * nothing to do; otherwise the limits are recomputed through
+ * SetMultiXactIdLimit.
+ */
+void
+MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
+{
+       MultiXactId     current = MultiXactState->oldestMultiXactId;
+
+       if (!MultiXactIdPrecedes(current, oldestMulti))
+               return;
+
+       SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
+}
+
 /*
  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
  *
@@ -1748,13 +2083,16 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
         */
        while (nmembers > 0)
        {
-               int                     entryno;
+               int                     flagsoff;
+               int                     flagsbit;
+               int                     difference;
 
                /*
                 * Only zero when at first entry of a page.
                 */
-               entryno = MXOffsetToMemberEntry(offset);
-               if (entryno == 0)
+               flagsoff = MXOffsetToFlagsOffset(offset);
+               flagsbit = MXOffsetToFlagsBitShift(offset);
+               if (flagsoff == 0 && flagsbit == 0)
                {
                        int                     pageno;
 
@@ -1769,33 +2107,32 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
                }
 
                /* Advance to next page (OK if nmembers goes negative) */
-               offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno);
-               nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno);
+               difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
+               offset += difference;
+               nmembers -= difference;
        }
 }
 
 /*
- * Remove all MultiXactOffset and MultiXactMember segments before the oldest
- * ones still of interest.
+ * GetOldestMultiXactId
  *
- * This is called only during checkpoints.     We assume no more than one
- * backend does this at a time.
+ * Return the oldest MultiXactId that's possibly still seen as live by
+ * any running transaction.  Older ones might still exist on disk, but they no
+ * longer have any running member transaction.
  *
- * XXX do we have any issues with needing to checkpoint here?
+ * It's not safe to truncate MultiXact SLRU segments on the value returned by
+ * this function; however, it can be used by a full-table vacuum to set the
+ * point at which it will be possible to truncate SLRU for that table.
  */
-static void
-TruncateMultiXact(void)
+MultiXactId
+GetOldestMultiXactId(void)
 {
-       MultiXactId nextMXact;
-       MultiXactOffset nextOffset;
-       MultiXactId oldestMXact;
-       MultiXactOffset oldestOffset;
-       int                     cutoffPage;
-       int                     i;
+       MultiXactId             oldestMXact;
+       MultiXactId             nextMXact;
+       int                             i;
 
        /*
-        * First, compute where we can safely truncate.  Per notes above, this is
-        * the oldest valid value among all the OldestMemberMXactId[] and
+        * This is the oldest valid value among all the OldestMemberMXactId[] and
         * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
         */
        LWLockAcquire(MultiXactGenLock, LW_SHARED);
@@ -1824,28 +2161,69 @@ TruncateMultiXact(void)
                        oldestMXact = thisoldest;
        }
 
-       /* Save the current nextOffset too */
-       nextOffset = MultiXactState->nextOffset;
-
        LWLockRelease(MultiXactGenLock);
 
-       debug_elog3(DEBUG2, "MultiXact: truncation point = %u", oldestMXact);
+       return oldestMXact;
+}
+
+typedef struct mxtruncinfo
+{
+       int             earliestExistingPage;
+} mxtruncinfo;
+
+/*
+ * SlruScanDirectory callback
+ *             This callback determines the earliest existing page number.
+ */
+static bool
+SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
+{
+       mxtruncinfo             *trunc = (mxtruncinfo *) data;
+
+       if (trunc->earliestExistingPage == -1 ||
+               ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
+       {
+               trunc->earliestExistingPage = segpage;
+       }
+
+       return false;   /* keep going */
+}
+
+/*
+ * Remove all MultiXactOffset and MultiXactMember segments before the oldest
+ * ones still of interest.
+ *
+ * This is called by vacuum after it has successfully advanced a database's
+ * datminmxid value; the cutoff value we're passed is the minimum of all
+ * databases' datminmxid values.
+ */
+void
+TruncateMultiXact(MultiXactId oldestMXact)
+{
+       MultiXactOffset oldestOffset;
+       mxtruncinfo             trunc;
+       MultiXactId             earliest;
 
        /*
-        * If we already truncated at this point, do nothing.  This saves time
-        * when no MultiXacts are getting used, which is probably not uncommon.
+        * Note we can't just plow ahead with the truncation; it's possible that
+        * there are no segments to truncate, which is a problem because we are
+        * going to attempt to read the offsets page to determine where to truncate
+        * the members SLRU.  So we first scan the directory to determine the
+        * earliest offsets page number that we can read without error.
         */
-       if (MultiXactState->lastTruncationPoint == oldestMXact)
+       trunc.earliestExistingPage = -1;
+       SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
+       earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
+       /* nothing to do: no offsets files, or cutoff precedes them all */
+       if (trunc.earliestExistingPage == -1 ||
+               MultiXactIdPrecedes(oldestMXact, earliest))
                return;
 
        /*
-        * We need to determine where to truncate MultiXactMember.      If we found a
-        * valid oldest MultiXactId, read its starting offset; otherwise we use
-        * the nextOffset value we saved above.
+        * First, compute the safe truncation point for MultiXactMember.
+        * This is the starting offset of the multixact we were passed
+        * as MultiXactOffset cutoff.
         */
-       if (oldestMXact == nextMXact)
-               oldestOffset = nextOffset;
-       else
        {
                int                     pageno;
                int                     slotno;
@@ -1857,34 +2235,23 @@ TruncateMultiXact(void)
                pageno = MultiXactIdToOffsetPage(oldestMXact);
                entryno = MultiXactIdToOffsetEntry(oldestMXact);
 
-               slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, oldestMXact);
-               offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+               slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno,
+                                                                                       oldestMXact);
+               offptr = (MultiXactOffset *)
+                       MultiXactOffsetCtl->shared->page_buffer[slotno];
                offptr += entryno;
                oldestOffset = *offptr;
 
                LWLockRelease(MultiXactOffsetControlLock);
        }
 
-       /*
-        * The cutoff point is the start of the segment containing oldestMXact. We
-        * pass the *page* containing oldestMXact to SimpleLruTruncate.
-        */
-       cutoffPage = MultiXactIdToOffsetPage(oldestMXact);
-
-       SimpleLruTruncate(MultiXactOffsetCtl, cutoffPage);
-
-       /*
-        * Also truncate MultiXactMember at the previously determined offset.
-        */
-       cutoffPage = MXOffsetToMemberPage(oldestOffset);
+       /* truncate MultiXactOffset */
+       SimpleLruTruncate(MultiXactOffsetCtl,
+                                         MultiXactIdToOffsetPage(oldestMXact));
 
-       SimpleLruTruncate(MultiXactMemberCtl, cutoffPage);
-
-       /*
-        * Set the last known truncation point.  We don't need a lock for this
-        * since only one backend does checkpoints at a time.
-        */
-       MultiXactState->lastTruncationPoint = oldestMXact;
+       /* truncate MultiXactMembers and we're done */
+       SimpleLruTruncate(MultiXactMemberCtl,
+                                         MXOffsetToMemberPage(oldestOffset));
 }
 
 /*
@@ -1934,7 +2301,7 @@ MultiXactMemberPagePrecedes(int page1, int page2)
  * XXX do we need to do something special for InvalidMultiXactId?
  * (Doesn't look like it.)
  */
-static bool
+bool
 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
 {
        int32           diff = (int32) (multi1 - multi2);
@@ -1953,7 +2320,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
        return (diff < 0);
 }
 
-
 /*
  * Write an xlog record reflecting the zeroing of either a MEMBERs or
  * OFFSETs page (info shows which)
@@ -2013,16 +2379,18 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
        }
        else if (info == XLOG_MULTIXACT_CREATE_ID)
        {
-               xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record);
-               TransactionId *xids = xlrec->xids;
+               xl_multixact_create *xlrec =
+                       (xl_multixact_create *) XLogRecGetData(record);
                TransactionId max_xid;
                int                     i;
 
                /* Store the data back into the SLRU files */
-               RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids);
+               RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
+                                                  xlrec->members);
 
                /* Make sure nextMXact/nextOffset are beyond what this record has */
-               MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids);
+               MultiXactAdvanceNextMXact(xlrec->mid + 1,
+                                                                 xlrec->moff + xlrec->nmembers);
 
                /*
                 * Make sure nextXid is beyond any XID mentioned in the record. This
@@ -2030,10 +2398,10 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
                 * evidence in the XLOG, but let's be safe.
                 */
                max_xid = record->xl_xid;
-               for (i = 0; i < xlrec->nxids; i++)
+               for (i = 0; i < xlrec->nmembers; i++)
                {
-                       if (TransactionIdPrecedes(max_xid, xids[i]))
-                               max_xid = xids[i];
+                       if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
+                               max_xid = xlrec->members[i].xid;
                }
 
                /*
@@ -2053,3 +2421,85 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
        else
                elog(PANIC, "multixact_redo: unknown op code %u", info);
 }
+
+/*
+ * pg_get_multixact_members
+ *             Set-returning function listing the members of a MultiXactId.
+ *
+ * The single argument is the MultiXactId to inspect.  One (xid, mode) row is
+ * returned per member: the member's transaction ID and its status rendered
+ * by mxstatus_to_string().  An ID below FirstMultiXactId is rejected with an
+ * error.
+ */
+Datum
+pg_get_multixact_members(PG_FUNCTION_ARGS)
+{
+       /* cross-call state: the member array and our position within it */
+       typedef struct
+       {
+               MultiXactMember *members;
+               int                             nmembers;
+               int                             iter;
+       } mxact;
+       MultiXactId             mxid = PG_GETARG_UINT32(0);
+       mxact              *multi;
+       FuncCallContext *funccxt;
+
+       if (mxid < FirstMultiXactId)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("invalid MultiXactId: %u", mxid)));
+
+       if (SRF_IS_FIRSTCALL())
+       {
+               MemoryContext oldcxt;
+               TupleDesc       tupdesc;
+
+               funccxt = SRF_FIRSTCALL_INIT();
+               /* state built below must outlive this call */
+               oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
+
+               multi = palloc(sizeof(mxact));
+               /* no need to allow for old values here */
+               multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false);
+               multi->iter = 0;
+
+               tupdesc = CreateTemplateTupleDesc(2, false);
+               TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
+                                                  XIDOID, -1, 0);
+               TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode",
+                                                  TEXTOID, -1, 0);
+
+               funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+               funccxt->user_fctx = multi;
+
+               MemoryContextSwitchTo(oldcxt);
+       }
+
+       funccxt = SRF_PERCALL_SETUP();
+       multi = (mxact *) funccxt->user_fctx;
+
+       /* emit one row per call; SRF_RETURN_NEXT returns to the caller */
+       while (multi->iter < multi->nmembers)
+       {
+               HeapTuple       tuple;
+               char       *values[2];
+
+               values[0] = palloc(32);
+               snprintf(values[0], 32, "%u", multi->members[multi->iter].xid);
+               values[1] = mxstatus_to_string(multi->members[multi->iter].status);
+
+               tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
+
+               multi->iter++;
+               pfree(values[0]);
+               SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
+       }
+
+       /* every member has been returned: free our state and finish */
+       if (multi->nmembers > 0)
+               pfree(multi->members);
+       pfree(multi);
+
+       SRF_RETURN_DONE(funccxt);
+}
index f041e4b2c0be97a1aed9209c52e3956f4d842161..64537d0128051f933f6521d7429032f5fc07eb83 100644 (file)
@@ -75,6 +75,8 @@ GetNewTransactionId(bool isSubXact)
         * If we're past xidStopLimit, refuse to execute transactions, unless
         * we are running in a standalone backend (which gives an escape hatch
         * to the DBA who somehow got past the earlier defenses).
+        *
+        * Note that this coding also appears in GetNewMultiXactId.
         *----------
         */
        if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit))
index d316c97926553588bf05716d3ef59d170786b211..cf2f6e70cff9e5fa0bf608183c22746d360e45ad 100644 (file)
@@ -3899,6 +3899,8 @@ BootStrapXLOG(void)
        checkPoint.nextMultiOffset = 0;
        checkPoint.oldestXid = FirstNormalTransactionId;
        checkPoint.oldestXidDB = TemplateDbOid;
+       checkPoint.oldestMulti = FirstMultiXactId;
+       checkPoint.oldestMultiDB = TemplateDbOid;
        checkPoint.time = (pg_time_t) time(NULL);
        checkPoint.oldestActiveXid = InvalidTransactionId;
 
@@ -3907,6 +3909,7 @@ BootStrapXLOG(void)
        ShmemVariableCache->oidCount = 0;
        MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
        SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+       SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
 
        /* Set up the XLOG page header */
        page->xlp_magic = XLOG_PAGE_MAGIC;
@@ -4979,6 +4982,9 @@ StartupXLOG(void)
        ereport(DEBUG1,
                        (errmsg("oldest unfrozen transaction ID: %u, in database %u",
                                        checkPoint.oldestXid, checkPoint.oldestXidDB)));
+       ereport(DEBUG1,
+                       (errmsg("oldest MultiXactId: %u, in database %u",
+                                       checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
        if (!TransactionIdIsNormal(checkPoint.nextXid))
                ereport(PANIC,
                                (errmsg("invalid next transaction ID")));
@@ -4989,6 +4995,7 @@ StartupXLOG(void)
        ShmemVariableCache->oidCount = 0;
        MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
        SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+       SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
        XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
        XLogCtl->ckptXid = checkPoint.nextXid;
 
@@ -6724,7 +6731,9 @@ CreateCheckPoint(int flags)
 
        MultiXactGetCheckptMulti(shutdown,
                                                         &checkPoint.nextMulti,
-                                                        &checkPoint.nextMultiOffset);
+                                                        &checkPoint.nextMultiOffset,
+                                                        &checkPoint.oldestMulti,
+                                                        &checkPoint.oldestMultiDB);
 
        /*
         * Having constructed the checkpoint record, ensure all shmem disk buffers
@@ -7479,6 +7488,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                MultiXactSetNextMXact(checkPoint.nextMulti,
                                                          checkPoint.nextMultiOffset);
                SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+               SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
 
                /*
                 * If we see a shutdown checkpoint while waiting for an end-of-backup
@@ -7577,6 +7587,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                                  checkPoint.oldestXid))
                        SetTransactionIdLimit(checkPoint.oldestXid,
                                                                  checkPoint.oldestXidDB);
+               MultiXactAdvanceOldest(checkPoint.oldestMulti,
+                                                          checkPoint.oldestMultiDB);
 
                /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
                ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
index 263205855b21f48fe7f8581a27a46fb3ba4f0336..db51e0b6084584b9479291c6a196f41298e260c8 100644 (file)
@@ -30,6 +30,7 @@
 #include "postgres.h"
 
 #include "access/htup_details.h"
+#include "access/multixact.h"
 #include "access/sysattr.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -779,6 +780,7 @@ InsertPgClassTuple(Relation pg_class_desc,
        values[Anum_pg_class_relhastriggers - 1] = BoolGetDatum(rd_rel->relhastriggers);
        values[Anum_pg_class_relhassubclass - 1] = BoolGetDatum(rd_rel->relhassubclass);
        values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid);
+       values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid);
        if (relacl != (Datum) 0)
                values[Anum_pg_class_relacl - 1] = relacl;
        else
@@ -854,7 +856,7 @@ AddNewRelationTuple(Relation pg_class_desc,
                        break;
        }
 
-       /* Initialize relfrozenxid */
+       /* Initialize relfrozenxid and relminmxid */
        if (relkind == RELKIND_RELATION ||
                relkind == RELKIND_TOASTVALUE)
        {
@@ -864,6 +866,15 @@ AddNewRelationTuple(Relation pg_class_desc,
                 * that will do.
                 */
                new_rel_reltup->relfrozenxid = RecentXmin;
+               /*
+                * Similarly, initialize the minimum Multixact to the first value that
+                * could possibly be stored in tuples in the table.  Running
+                * transactions could reuse values from their local cache, so we are
+                * careful to consider all currently running multis.
+                *
+                * XXX this could be refined further, but is it worth the hassle?
+                */
+               new_rel_reltup->relminmxid = GetOldestMultiXactId();
        }
        else
        {
@@ -874,6 +885,7 @@ AddNewRelationTuple(Relation pg_class_desc,
                 * commands/sequence.c.)
                 */
                new_rel_reltup->relfrozenxid = InvalidTransactionId;
+               new_rel_reltup->relminmxid = InvalidMultiXactId;
        }
 
        new_rel_reltup->relowner = relowner;
index 5892e44667776acbe642fb3d3679105b6fec08cd..9b339292e4966b50d89dcef4ef8d2c37167f1e7f 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <unistd.h>
 
+#include "access/multixact.h"
 #include "access/relscan.h"
 #include "access/sysattr.h"
 #include "access/transam.h"
@@ -2353,8 +2354,7 @@ IndexBuildHeapScan(Relation heapRelation,
                                         * As with INSERT_IN_PROGRESS case, this is unexpected
                                         * unless it's our own deletion or a system catalog.
                                         */
-                                       Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
-                                       xwait = HeapTupleHeaderGetXmax(heapTuple->t_data);
+                                       xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
                                        if (!TransactionIdIsCurrentTransactionId(xwait))
                                        {
                                                if (!is_system_catalog)
@@ -3184,7 +3184,8 @@ reindex_index(Oid indexId, bool skip_constraint_checks)
                }
 
                /* We'll build a new physical relation for the index */
-               RelationSetNewRelfilenode(iRel, InvalidTransactionId);
+               RelationSetNewRelfilenode(iRel, InvalidTransactionId,
+                                                                 InvalidMultiXactId);
 
                /* Initialize the index and rebuild */
                /* Note: we do not need to re-establish pkey setting */
@@ -3364,7 +3365,7 @@ reindex_relation(Oid relid, int flags)
 
        /* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */
        if (is_pg_class)
-               (void) RelationGetIndexAttrBitmap(rel);
+               (void) RelationGetIndexAttrBitmap(rel, false);
 
        PG_TRY();
        {
index 7a5eb42424bc40b3e2aceb2f7b7421cd38c66470..d7b17a5aba6cf1d3b7f0ac1ffee3a729793f246d 100644 (file)
@@ -16,6 +16,7 @@
 
 #include <math.h>
 
+#include "access/multixact.h"
 #include "access/transam.h"
 #include "access/tupconvert.h"
 #include "access/tuptoaster.h"
@@ -580,7 +581,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
                                                        totalrows,
                                                        visibilitymap_count(onerel),
                                                        hasindex,
-                                                       InvalidTransactionId);
+                                                       InvalidTransactionId,
+                                                       InvalidMultiXactId);
 
        /*
         * Same for indexes. Vacuum always scans all indexes, so if we're part of
@@ -600,7 +602,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
                                                                totalindexrows,
                                                                0,
                                                                false,
-                                                               InvalidTransactionId);
+                                                               InvalidTransactionId,
+                                                               InvalidMultiXactId);
                }
        }
 
@@ -1193,7 +1196,7 @@ acquire_sample_rows(Relation onerel, int elevel,
                                         * right.  (Note: this works out properly when the row was
                                         * both inserted and deleted in our xact.)
                                         */
-                                       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targtuple.t_data)))
+                                       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple.t_data)))
                                                deadrows += 1;
                                        else
                                                liverows += 1;
index 238781b6a70f267bf52bd662858b46897f552c09..c0cb2f665457b81136a02029b28db3299514a730 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "postgres.h"
 
+#include "access/multixact.h"
 #include "access/relscan.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
@@ -65,7 +66,8 @@ static void rebuild_relation(Relation OldHeap, Oid indexOid,
                                 int freeze_min_age, int freeze_table_age, bool verbose);
 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
                           int freeze_min_age, int freeze_table_age, bool verbose,
-                          bool *pSwapToastByContent, TransactionId *pFreezeXid);
+                          bool *pSwapToastByContent, TransactionId *pFreezeXid,
+                          MultiXactId *pFreezeMulti);
 static List *get_tables_to_cluster(MemoryContext cluster_context);
 static void reform_and_rewrite_tuple(HeapTuple tuple,
                                                 TupleDesc oldTupDesc, TupleDesc newTupDesc,
@@ -549,6 +551,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
        bool            is_system_catalog;
        bool            swap_toast_by_content;
        TransactionId frozenXid;
+       MultiXactId     frozenMulti;
 
        /* Mark the correct index as clustered */
        if (OidIsValid(indexOid))
@@ -566,14 +569,14 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
        /* Copy the heap data into the new table in the desired order */
        copy_heap_data(OIDNewHeap, tableOid, indexOid,
                                   freeze_min_age, freeze_table_age, verbose,
-                                  &swap_toast_by_content, &frozenXid);
+                                  &swap_toast_by_content, &frozenXid, &frozenMulti);
 
        /*
         * Swap the physical files of the target and transient tables, then
         * rebuild the target's indexes and throw away the transient table.
         */
        finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
-                                        swap_toast_by_content, false, frozenXid);
+                                        swap_toast_by_content, false, frozenXid, frozenMulti);
 }
 
 
@@ -706,7 +709,8 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
 static void
 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
                           int freeze_min_age, int freeze_table_age, bool verbose,
-                          bool *pSwapToastByContent, TransactionId *pFreezeXid)
+                          bool *pSwapToastByContent, TransactionId *pFreezeXid,
+                          MultiXactId *pFreezeMulti)
 {
        Relation        NewHeap,
                                OldHeap,
@@ -722,6 +726,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
        bool            is_system_catalog;
        TransactionId OldestXmin;
        TransactionId FreezeXid;
+       MultiXactId     MultiXactFrzLimit;
        RewriteState rwstate;
        bool            use_sort;
        Tuplesortstate *tuplesort;
@@ -822,7 +827,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
         */
        vacuum_set_xid_limits(freeze_min_age, freeze_table_age,
                                                  OldHeap->rd_rel->relisshared,
-                                                 &OldestXmin, &FreezeXid, NULL);
+                                                 &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit);
 
        /*
         * FreezeXid will become the table's new relfrozenxid, and that mustn't go
@@ -831,14 +836,16 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
        if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
                FreezeXid = OldHeap->rd_rel->relfrozenxid;
 
-       /* return selected value to caller */
+       /* return selected values to caller */
        *pFreezeXid = FreezeXid;
+       *pFreezeMulti = MultiXactFrzLimit;
 
        /* Remember if it's a system catalog */
        is_system_catalog = IsSystemRelation(OldHeap);
 
        /* Initialize the rewrite operation */
-       rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
+       rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid,
+                                                                MultiXactFrzLimit, use_wal);
 
        /*
         * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
@@ -966,9 +973,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
                                /*
                                 * Similar situation to INSERT_IN_PROGRESS case.
                                 */
-                               Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
                                if (!is_system_catalog &&
-                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple->t_data)))
+                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
                                        elog(WARNING, "concurrent delete in progress within table \"%s\"",
                                                 RelationGetRelationName(OldHeap));
                                /* treat as recently dead */
@@ -1097,6 +1103,7 @@ static void
 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
                                        bool swap_toast_by_content,
                                        TransactionId frozenXid,
+                                       MultiXactId frozenMulti,
                                        Oid *mapped_tables)
 {
        Relation        relRelation;
@@ -1204,11 +1211,13 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
         * and then fail to commit the pg_class update.
         */
 
-       /* set rel1's frozen Xid */
+       /* set rel1's frozen Xid and minimum MultiXid */
        if (relform1->relkind != RELKIND_INDEX)
        {
                Assert(TransactionIdIsNormal(frozenXid));
                relform1->relfrozenxid = frozenXid;
+               Assert(MultiXactIdIsValid(frozenMulti));
+               relform1->relminmxid = frozenMulti;
        }
 
        /* swap size statistics too, since new rel has freshly-updated stats */
@@ -1272,6 +1281,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
                                                                        target_is_pg_class,
                                                                        swap_toast_by_content,
                                                                        frozenXid,
+                                                                       frozenMulti,
                                                                        mapped_tables);
                        }
                        else
@@ -1361,6 +1371,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
                                                        target_is_pg_class,
                                                        swap_toast_by_content,
                                                        InvalidTransactionId,
+                                                       InvalidMultiXactId,
                                                        mapped_tables);
 
        /* Clean up. */
@@ -1398,7 +1409,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
                                 bool is_system_catalog,
                                 bool swap_toast_by_content,
                                 bool check_constraints,
-                                TransactionId frozenXid)
+                                TransactionId frozenXid,
+                                MultiXactId frozenMulti)
 {
        ObjectAddress object;
        Oid                     mapped_tables[4];
@@ -1414,7 +1426,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
         */
        swap_relation_files(OIDOldHeap, OIDNewHeap,
                                                (OIDOldHeap == RelationRelationId),
-                                               swap_toast_by_content, frozenXid, mapped_tables);
+                                               swap_toast_by_content, frozenXid, frozenMulti,
+                                               mapped_tables);
 
        /*
         * If it's a system catalog, queue an sinval message to flush all
index 4ad4b9975850d47fcda51836bccf2b4299044487..5b06af24a6cfbea7538da47b50db13dac3a39e88 100644 (file)
@@ -80,6 +80,7 @@ static bool get_db_info(const char *name, LOCKMODE lockmode,
                        Oid *dbIdP, Oid *ownerIdP,
                        int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP,
                        Oid *dbLastSysOidP, TransactionId *dbFrozenXidP,
+                       MultiXactId *dbMinMultiP,
                        Oid *dbTablespace, char **dbCollate, char **dbCtype);
 static bool have_createdb_privilege(void);
 static void remove_dbtablespaces(Oid db_id);
@@ -104,6 +105,7 @@ createdb(const CreatedbStmt *stmt)
        bool            src_allowconn;
        Oid                     src_lastsysoid;
        TransactionId src_frozenxid;
+       MultiXactId src_minmxid;
        Oid                     src_deftablespace;
        volatile Oid dst_deftablespace;
        Relation        pg_database_rel;
@@ -288,7 +290,7 @@ createdb(const CreatedbStmt *stmt)
        if (!get_db_info(dbtemplate, ShareLock,
                                         &src_dboid, &src_owner, &src_encoding,
                                         &src_istemplate, &src_allowconn, &src_lastsysoid,
-                                        &src_frozenxid, &src_deftablespace,
+                                        &src_frozenxid, &src_minmxid, &src_deftablespace,
                                         &src_collate, &src_ctype))
                ereport(ERROR,
                                (errcode(ERRCODE_UNDEFINED_DATABASE),
@@ -491,6 +493,7 @@ createdb(const CreatedbStmt *stmt)
        new_record[Anum_pg_database_datconnlimit - 1] = Int32GetDatum(dbconnlimit);
        new_record[Anum_pg_database_datlastsysoid - 1] = ObjectIdGetDatum(src_lastsysoid);
        new_record[Anum_pg_database_datfrozenxid - 1] = TransactionIdGetDatum(src_frozenxid);
+       new_record[Anum_pg_database_datminmxid - 1] = MultiXactIdGetDatum(src_minmxid);
        new_record[Anum_pg_database_dattablespace - 1] = ObjectIdGetDatum(dst_deftablespace);
 
        /*
@@ -786,7 +789,7 @@ dropdb(const char *dbname, bool missing_ok)
        pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock);
 
        if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL,
-                                        &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL))
+                                        &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL))
        {
                if (!missing_ok)
                {
@@ -945,7 +948,7 @@ RenameDatabase(const char *oldname, const char *newname)
        rel = heap_open(DatabaseRelationId, RowExclusiveLock);
 
        if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL,
-                                        NULL, NULL, NULL, NULL, NULL, NULL, NULL))
+                                        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL))
                ereport(ERROR,
                                (errcode(ERRCODE_UNDEFINED_DATABASE),
                                 errmsg("database \"%s\" does not exist", oldname)));
@@ -1046,7 +1049,7 @@ movedb(const char *dbname, const char *tblspcname)
        pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock);
 
        if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL,
-                                        NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL))
+                                        NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL))
                ereport(ERROR,
                                (errcode(ERRCODE_UNDEFINED_DATABASE),
                                 errmsg("database \"%s\" does not exist", dbname)));
@@ -1599,6 +1602,7 @@ get_db_info(const char *name, LOCKMODE lockmode,
                        Oid *dbIdP, Oid *ownerIdP,
                        int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP,
                        Oid *dbLastSysOidP, TransactionId *dbFrozenXidP,
+                       MultiXactId *dbMinMultiP,
                        Oid *dbTablespace, char **dbCollate, char **dbCtype)
 {
        bool            result = false;
@@ -1685,6 +1689,9 @@ get_db_info(const char *name, LOCKMODE lockmode,
                                /* limit of frozen XIDs */
                                if (dbFrozenXidP)
                                        *dbFrozenXidP = dbform->datfrozenxid;
+                               /* limit of frozen Multixacts */
+                               if (dbMinMultiP)
+                                       *dbMinMultiP = dbform->datminmxid;
                                /* default tablespace for this database */
                                if (dbTablespace)
                                        *dbTablespace = dbform->dattablespace;
index 1f2546d69ca8ea487365ff4923d07438a8cfbd8f..de41c8a1c71500d3f430110037d161432cf78802 100644 (file)
@@ -14,8 +14,9 @@
  */
 #include "postgres.h"
 
-#include "access/transam.h"
 #include "access/htup_details.h"
+#include "access/multixact.h"
+#include "access/transam.h"
 #include "access/xlogutils.h"
 #include "catalog/dependency.h"
 #include "catalog/namespace.h"
@@ -282,8 +283,10 @@ ResetSequence(Oid seq_relid)
        /*
         * Create a new storage file for the sequence.  We want to keep the
         * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs.
+        * Same with relminmxid, since a sequence will never contain multixacts.
         */
-       RelationSetNewRelfilenode(seq_rel, InvalidTransactionId);
+       RelationSetNewRelfilenode(seq_rel, InvalidTransactionId,
+                                                         InvalidMultiXactId);
 
        /*
         * Insert the modified tuple into the new storage file.
@@ -1110,7 +1113,8 @@ read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple)
         * bit update, ie, don't bother to WAL-log it, since we can certainly do
         * this again if the update gets lost.
         */
-       if (HeapTupleHeaderGetXmax(seqtuple->t_data) != InvalidTransactionId)
+       Assert(!(seqtuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
+       if (HeapTupleHeaderGetRawXmax(seqtuple->t_data) != InvalidTransactionId)
        {
                HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId);
                seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED;
index cad83117f95e8f6e8e48f1c0645b32036758cc3c..6bc056bbc332a1ccec1ee101865114949f133d19 100644 (file)
@@ -15,7 +15,9 @@
 #include "postgres.h"
 
 #include "access/genam.h"
+#include "access/heapam.h"
 #include "access/heapam_xlog.h"
+#include "access/multixact.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "access/sysattr.h"
@@ -1130,6 +1132,7 @@ ExecuteTruncate(TruncateStmt *stmt)
                {
                        Oid                     heap_relid;
                        Oid                     toast_relid;
+                       MultiXactId     minmulti;
 
                        /*
                         * This effectively deletes all rows in the table, and may be done
@@ -1139,6 +1142,8 @@ ExecuteTruncate(TruncateStmt *stmt)
                         */
                        CheckTableForSerializableConflictIn(rel);
 
+                       minmulti = GetOldestMultiXactId();
+
                        /*
                         * Need the full transaction-safe pushups.
                         *
@@ -1146,7 +1151,7 @@ ExecuteTruncate(TruncateStmt *stmt)
                         * as the relfilenode value. The old storage file is scheduled for
                         * deletion at commit.
                         */
-                       RelationSetNewRelfilenode(rel, RecentXmin);
+                       RelationSetNewRelfilenode(rel, RecentXmin, minmulti);
                        if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
                                heap_create_init_fork(rel);
 
@@ -1159,7 +1164,7 @@ ExecuteTruncate(TruncateStmt *stmt)
                        if (OidIsValid(toast_relid))
                        {
                                rel = relation_open(toast_relid, AccessExclusiveLock);
-                               RelationSetNewRelfilenode(rel, RecentXmin);
+                               RelationSetNewRelfilenode(rel, RecentXmin, minmulti);
                                if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
                                        heap_create_init_fork(rel);
                                heap_close(rel, NoLock);
@@ -3516,7 +3521,8 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode)
                         * interest in letting this code work on system catalogs.
                         */
                        finish_heap_swap(tab->relid, OIDNewHeap,
-                                                        false, false, true, RecentXmin);
+                                                        false, false, true, RecentXmin,
+                                                        ReadNextMultiXactId());
                }
                else
                {
index a719cf24f4393ff5e79ed1a1615860ed306685b8..f11a8ec5d421d3dbe6b04d04b965fa10515ec77d 100644 (file)
@@ -73,6 +73,7 @@ static HeapTuple GetTupleForTrigger(EState *estate,
                                   EPQState *epqstate,
                                   ResultRelInfo *relinfo,
                                   ItemPointer tid,
+                                  LockTupleMode lockmode,
                                   TupleTableSlot **newSlot);
 static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
                           Trigger *trigger, TriggerEvent event,
@@ -2147,7 +2148,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
        int                     i;
 
        trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid,
-                                                                  &newSlot);
+                                                                  LockTupleExclusive, &newSlot);
        if (trigtuple == NULL)
                return false;
 
@@ -2201,7 +2202,8 @@ ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
        if (trigdesc && trigdesc->trig_delete_after_row)
        {
                HeapTuple       trigtuple = GetTupleForTrigger(estate, NULL, relinfo,
-                                                                                                  tupleid, NULL);
+                                                                                                  tupleid, LockTupleExclusive,
+                                                                                                  NULL);
 
                AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_DELETE,
                                                          true, trigtuple, NULL, NIL, NULL);
@@ -2332,10 +2334,24 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
        TupleTableSlot *newSlot;
        int                     i;
        Bitmapset  *modifiedCols;
+       Bitmapset  *keyCols;
+       LockTupleMode lockmode;
+
+       /*
+        * Compute lock mode to use.  If columns that are part of the key have not
+        * been modified, then we can use a weaker lock, allowing for better
+        * concurrency.
+        */
+       modifiedCols = GetModifiedColumns(relinfo, estate);
+       keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, true);
+       if (bms_overlap(keyCols, modifiedCols))
+               lockmode = LockTupleExclusive;
+       else
+               lockmode = LockTupleNoKeyExclusive;
 
        /* get a copy of the on-disk tuple we are planning to update */
        trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid,
-                                                                  &newSlot);
+                                                                  lockmode, &newSlot);
        if (trigtuple == NULL)
                return NULL;                    /* cancel the update action */
 
@@ -2357,7 +2373,6 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
                newtuple = slottuple;
        }
 
-       modifiedCols = GetModifiedColumns(relinfo, estate);
 
        LocTriggerData.type = T_TriggerData;
        LocTriggerData.tg_event = TRIGGER_EVENT_UPDATE |
@@ -2426,7 +2441,8 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo,
        if (trigdesc && trigdesc->trig_update_after_row)
        {
                HeapTuple       trigtuple = GetTupleForTrigger(estate, NULL, relinfo,
-                                                                                                  tupleid, NULL);
+                                                                                                  tupleid, LockTupleExclusive,
+                                                                                                  NULL);
 
                AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_UPDATE,
                                                          true, trigtuple, newtuple, recheckIndexes,
@@ -2565,6 +2581,7 @@ GetTupleForTrigger(EState *estate,
                                   EPQState *epqstate,
                                   ResultRelInfo *relinfo,
                                   ItemPointer tid,
+                                  LockTupleMode lockmode,
                                   TupleTableSlot **newSlot)
 {
        Relation        relation = relinfo->ri_RelationDesc;
@@ -2589,8 +2606,8 @@ ltrmark:;
                tuple.t_self = *tid;
                test = heap_lock_tuple(relation, &tuple,
                                                           estate->es_output_cid,
-                                                          LockTupleExclusive, false /* wait */,
-                                                          &buffer, &hufd);
+                                                          lockmode, false /* wait */,
+                                                          false, &buffer, &hufd);
                switch (test)
                {
                        case HeapTupleSelfUpdated:
@@ -2630,6 +2647,7 @@ ltrmark:;
                                                                                   epqstate,
                                                                                   relation,
                                                                                   relinfo->ri_RangeTableIndex,
+                                                                                  lockmode,
                                                                                   &hufd.ctid,
                                                                                   hufd.xmax);
                                        if (!TupIsNull(epqslot))
index 2d3170a2504af005a01feab9f59e855a6cde7773..a37a54e5b42fddbbbb19d5f6a584562047714150 100644 (file)
@@ -26,6 +26,7 @@
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/multixact.h"
 #include "access/transam.h"
 #include "access/xact.h"
 #include "catalog/namespace.h"
@@ -63,7 +64,7 @@ static BufferAccessStrategy vac_strategy;
 
 /* non-export function prototypes */
 static List *get_rel_oids(Oid relid, const RangeVar *vacrel);
-static void vac_truncate_clog(TransactionId frozenXID);
+static void vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti);
 static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast,
                   bool for_wraparound);
 
@@ -379,7 +380,8 @@ vacuum_set_xid_limits(int freeze_min_age,
                                          bool sharedRel,
                                          TransactionId *oldestXmin,
                                          TransactionId *freezeLimit,
-                                         TransactionId *freezeTableLimit)
+                                         TransactionId *freezeTableLimit,
+                                         MultiXactId *multiXactFrzLimit)
 {
        int                     freezemin;
        TransactionId limit;
@@ -463,8 +465,22 @@ vacuum_set_xid_limits(int freeze_min_age,
 
                *freezeTableLimit = limit;
        }
-}
 
+       if (multiXactFrzLimit != NULL)
+       {
+               MultiXactId     mxLimit;
+
+               /*
+                * simplistic multixactid freezing: use the same freezing policy as
+                * for Xids
+                */
+               mxLimit = GetOldestMultiXactId() - freezemin;
+               if (mxLimit < FirstMultiXactId)
+                       mxLimit = FirstMultiXactId;
+
+               *multiXactFrzLimit = mxLimit;
+       }
+}
 
 /*
  * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples
@@ -574,7 +590,8 @@ void
 vac_update_relstats(Relation relation,
                                        BlockNumber num_pages, double num_tuples,
                                        BlockNumber num_all_visible_pages,
-                                       bool hasindex, TransactionId frozenxid)
+                                       bool hasindex, TransactionId frozenxid,
+                                       MultiXactId minmulti)
 {
        Oid                     relid = RelationGetRelid(relation);
        Relation        rd;
@@ -648,6 +665,14 @@ vac_update_relstats(Relation relation,
                dirty = true;
        }
 
+       /* relminmxid must never go backward, either */
+       if (MultiXactIdIsValid(minmulti) &&
+               MultiXactIdPrecedes(pgcform->relminmxid, minmulti))
+       {
+               pgcform->relminmxid = minmulti;
+               dirty = true;
+       }
+
        /* If anything changed, write out the tuple. */
        if (dirty)
                heap_inplace_update(rd, ctup);
@@ -660,8 +685,13 @@ vac_update_relstats(Relation relation,
  *     vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
  *
  *             Update pg_database's datfrozenxid entry for our database to be the
- *             minimum of the pg_class.relfrozenxid values.  If we are able to
- *             advance pg_database.datfrozenxid, also try to truncate pg_clog.
+ *             minimum of the pg_class.relfrozenxid values.
+ *
+ *             Similarly, update our datminmxid to be the minimum of the
+ *             pg_class.relminmxid values.
+ *
+ *             If we are able to advance either pg_database value, also try to
+ *             truncate pg_clog and pg_multixact.
  *
  *             We violate transaction semantics here by overwriting the database's
  *             existing pg_database tuple with the new value.  This is reasonably
@@ -678,16 +708,23 @@ vac_update_datfrozenxid(void)
        SysScanDesc scan;
        HeapTuple       classTup;
        TransactionId newFrozenXid;
+       MultiXactId     newFrozenMulti;
        bool            dirty = false;
 
        /*
         * Initialize the "min" calculation with GetOldestXmin, which is a
         * reasonable approximation to the minimum relfrozenxid for not-yet-
         * committed pg_class entries for new tables; see AddNewRelationTuple().
-        * Se we cannot produce a wrong minimum by starting with this.
+        * So we cannot produce a wrong minimum by starting with this.
         */
        newFrozenXid = GetOldestXmin(true, true);
 
+       /*
+        * Similarly, initialize the MultiXact "min" with the value that would
+        * be used on pg_class for new tables.  See AddNewRelationTuple().
+        */
+       newFrozenMulti = GetOldestMultiXactId();
+
        /*
         * We must seqscan pg_class to find the minimum Xid, because there is no
         * index that can help us here.
@@ -710,9 +747,13 @@ vac_update_datfrozenxid(void)
                        continue;
 
                Assert(TransactionIdIsNormal(classForm->relfrozenxid));
+               Assert(MultiXactIdIsValid(classForm->relminmxid));
 
                if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid))
                        newFrozenXid = classForm->relfrozenxid;
+
+               if (MultiXactIdPrecedes(classForm->relminmxid, newFrozenMulti))
+                       newFrozenMulti = classForm->relminmxid;
        }
 
        /* we're done with pg_class */
@@ -720,6 +761,7 @@ vac_update_datfrozenxid(void)
        heap_close(relation, AccessShareLock);
 
        Assert(TransactionIdIsNormal(newFrozenXid));
+       Assert(MultiXactIdIsValid(newFrozenMulti));
 
        /* Now fetch the pg_database tuple we need to update. */
        relation = heap_open(DatabaseRelationId, RowExclusiveLock);
@@ -740,6 +782,13 @@ vac_update_datfrozenxid(void)
                dirty = true;
        }
 
+       /* ditto */
+       if (MultiXactIdPrecedes(dbform->datminmxid, newFrozenMulti))
+       {
+               dbform->datminmxid = newFrozenMulti;
+               dirty = true;
+       }
+
        if (dirty)
                heap_inplace_update(relation, tuple);
 
@@ -752,7 +801,7 @@ vac_update_datfrozenxid(void)
         * this action will update that too.
         */
        if (dirty || ForceTransactionIdLimitUpdate())
-               vac_truncate_clog(newFrozenXid);
+               vac_truncate_clog(newFrozenXid, newFrozenMulti);
 }
 
 
@@ -771,17 +820,19 @@ vac_update_datfrozenxid(void)
  *             info is stale.
  */
 static void
-vac_truncate_clog(TransactionId frozenXID)
+vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti)
 {
        TransactionId myXID = GetCurrentTransactionId();
        Relation        relation;
        HeapScanDesc scan;
        HeapTuple       tuple;
-       Oid                     oldest_datoid;
+       Oid                     oldestxid_datoid;
+       Oid                     oldestmulti_datoid;
        bool            frozenAlreadyWrapped = false;
 
-       /* init oldest_datoid to sync with my frozenXID */
-       oldest_datoid = MyDatabaseId;
+       /* init oldest datoids to sync with my frozen values */
+       oldestxid_datoid = MyDatabaseId;
+       oldestmulti_datoid = MyDatabaseId;
 
        /*
         * Scan pg_database to compute the minimum datfrozenxid
@@ -804,13 +855,20 @@ vac_truncate_clog(TransactionId frozenXID)
                Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);
 
                Assert(TransactionIdIsNormal(dbform->datfrozenxid));
+               Assert(MultiXactIdIsValid(dbform->datminmxid));
 
                if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
                        frozenAlreadyWrapped = true;
                else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
                {
                        frozenXID = dbform->datfrozenxid;
-                       oldest_datoid = HeapTupleGetOid(tuple);
+                       oldestxid_datoid = HeapTupleGetOid(tuple);
+               }
+
+               if (MultiXactIdPrecedes(dbform->datminmxid, frozenMulti))
+               {
+                       frozenMulti = dbform->datminmxid;
+                       oldestmulti_datoid = HeapTupleGetOid(tuple);
                }
        }
 
@@ -832,14 +890,18 @@ vac_truncate_clog(TransactionId frozenXID)
                return;
        }
 
-       /* Truncate CLOG to the oldest frozenxid */
+       /* Truncate CLOG and Multi to the oldest computed value */
        TruncateCLOG(frozenXID);
+       TruncateMultiXact(frozenMulti);
 
        /*
-        * Update the wrap limit for GetNewTransactionId.  Note: this function
-        * will also signal the postmaster for an(other) autovac cycle if needed.
+        * Update the wrap limit for GetNewTransactionId and creation of new
+        * MultiXactIds.  Note: these functions will also signal the postmaster for
+        * an(other) autovac cycle if needed.   XXX should we avoid possibly
+        * signalling twice?
         */
-       SetTransactionIdLimit(frozenXID, oldest_datoid);
+       SetTransactionIdLimit(frozenXID, oldestxid_datoid);
+       MultiXactAdvanceOldest(frozenMulti, oldestmulti_datoid);
 }
 
 
index 8eda66364b378ab24bd663cd7e914d3224887a97..5ec65ea41be4cb17a4d80e44fb52b9f315246dc2 100644 (file)
@@ -41,6 +41,7 @@
 #include "access/heapam.h"
 #include "access/heapam_xlog.h"
 #include "access/htup_details.h"
+#include "access/multixact.h"
 #include "access/transam.h"
 #include "access/visibilitymap.h"
 #include "catalog/storage.h"
@@ -124,6 +125,7 @@ static int  elevel = -1;
 
 static TransactionId OldestXmin;
 static TransactionId FreezeLimit;
+static MultiXactId MultiXactFrzLimit;
 
 static BufferAccessStrategy vac_strategy;
 
@@ -180,6 +182,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
        double          new_rel_tuples;
        BlockNumber new_rel_allvisible;
        TransactionId new_frozen_xid;
+       MultiXactId     new_min_multi;
 
        /* measure elapsed time iff autovacuum logging requires it */
        if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
@@ -197,7 +200,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
 
        vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
                                                  onerel->rd_rel->relisshared,
-                                                 &OldestXmin, &FreezeLimit, &freezeTableLimit);
+                                                 &OldestXmin, &FreezeLimit, &freezeTableLimit,
+                                                 &MultiXactFrzLimit);
        scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
                                                                                         freezeTableLimit);
 
@@ -267,12 +271,17 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
        if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
                new_frozen_xid = InvalidTransactionId;
 
+       new_min_multi = MultiXactFrzLimit;
+       if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
+               new_min_multi = InvalidMultiXactId;
+
        vac_update_relstats(onerel,
                                                new_rel_pages,
                                                new_rel_tuples,
                                                new_rel_allvisible,
                                                vacrelstats->hasindex,
-                                               new_frozen_xid);
+                                               new_frozen_xid,
+                                               new_min_multi);
 
        /*
         * Report results to the stats collector, too. An early terminated
@@ -839,7 +848,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                                 * Each non-removable tuple must be checked to see if it needs
                                 * freezing.  Note we already have exclusive buffer lock.
                                 */
-                               if (heap_freeze_tuple(tuple.t_data, FreezeLimit))
+                               if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
+                                                                         MultiXactFrzLimit))
                                        frozen[nfrozen++] = offnum;
                        }
                }                                               /* scan along page */
@@ -857,7 +867,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                                XLogRecPtr      recptr;
 
                                recptr = log_heap_freeze(onerel, buf, FreezeLimit,
-                                                                                frozen, nfrozen);
+                                                                                MultiXactFrzLimit, frozen, nfrozen);
                                PageSetLSN(page, recptr);
                                PageSetTLI(page, ThisTimeLineID);
                        }
@@ -1176,7 +1186,8 @@ lazy_check_needs_freeze(Buffer buf)
 
                tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
 
-               if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, buf))
+               if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
+                                                                       MultiXactFrzLimit, buf))
                        return true;
        }                                                       /* scan along page */
 
@@ -1253,7 +1264,8 @@ lazy_cleanup_index(Relation indrel,
                                                        stats->num_index_tuples,
                                                        0,
                                                        false,
-                                                       InvalidTransactionId);
+                                                       InvalidTransactionId,
+                                                       InvalidMultiXactId);
 
        ereport(elevel,
                        (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
index 9d5d829406e6f4e50ed1464f4b71debdc07b3610..23a6a612565a92e7f977e5baa31593d008a0e9f3 100644 (file)
@@ -162,7 +162,8 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
                case CMD_SELECT:
 
                        /*
-                        * SELECT FOR UPDATE/SHARE and modifying CTEs need to mark tuples
+                        * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark
+                        * tuples
                         */
                        if (queryDesc->plannedstmt->rowMarks != NIL ||
                                queryDesc->plannedstmt->hasModifyingCTE)
@@ -775,7 +776,7 @@ InitPlan(QueryDesc *queryDesc, int eflags)
        }
 
        /*
-        * Similarly, we have to lock relations selected FOR UPDATE/FOR SHARE
+        * Similarly, we have to lock relations selected FOR [KEY] UPDATE/SHARE
         * before we initialize the plan tree, else we'd be risking lock upgrades.
         * While we are at it, build the ExecRowMark list.
         */
@@ -794,7 +795,9 @@ InitPlan(QueryDesc *queryDesc, int eflags)
                switch (rc->markType)
                {
                        case ROW_MARK_EXCLUSIVE:
+                       case ROW_MARK_NOKEYEXCLUSIVE:
                        case ROW_MARK_SHARE:
+                       case ROW_MARK_KEYSHARE:
                                relid = getrelid(rc->rti, rangeTable);
                                relation = heap_open(relid, RowShareLock);
                                break;
@@ -1341,7 +1344,7 @@ ExecEndPlan(PlanState *planstate, EState *estate)
        }
 
        /*
-        * close any relations selected FOR UPDATE/FOR SHARE, again keeping locks
+        * close any relations selected FOR [KEY] UPDATE/SHARE, again keeping locks
         */
        foreach(l, estate->es_rowMarks)
        {
@@ -1694,6 +1697,7 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist)
  *     epqstate - state for EvalPlanQual rechecking
  *     relation - table containing tuple
  *     rti - rangetable index of table containing tuple
+ *     lockmode - requested tuple lock mode
  *     *tid - t_ctid from the outdated tuple (ie, next updated version)
  *     priorXmax - t_xmax from the outdated tuple
  *
@@ -1702,10 +1706,13 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist)
  *
  * Returns a slot containing the new candidate update/delete tuple, or
  * NULL if we determine we shouldn't process the row.
+ *
+ * Note: properly, lockmode should be declared as enum LockTupleMode,
+ * but we use "int" to avoid having to include heapam.h in executor.h.
  */
 TupleTableSlot *
 EvalPlanQual(EState *estate, EPQState *epqstate,
-                        Relation relation, Index rti,
+                        Relation relation, Index rti, int lockmode,
                         ItemPointer tid, TransactionId priorXmax)
 {
        TupleTableSlot *slot;
@@ -1716,7 +1723,7 @@ EvalPlanQual(EState *estate, EPQState *epqstate,
        /*
         * Get and lock the updated version of the row; if fail, return NULL.
         */
-       copyTuple = EvalPlanQualFetch(estate, relation, LockTupleExclusive,
+       copyTuple = EvalPlanQualFetch(estate, relation, lockmode,
                                                                  tid, priorXmax);
 
        if (copyTuple == NULL)
@@ -1864,7 +1871,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
                        test = heap_lock_tuple(relation, &tuple,
                                                                   estate->es_output_cid,
                                                                   lockmode, false /* wait */,
-                                                                  &buffer, &hufd);
+                                                                  false, &buffer, &hufd);
                        /* We now have two pins on the buffer, get rid of one */
                        ReleaseBuffer(buffer);
 
@@ -1965,7 +1972,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
                /* updated, so look at the updated row */
                tuple.t_self = tuple.t_data->t_ctid;
                /* updated row should have xmin matching this xmax */
-               priorXmax = HeapTupleHeaderGetXmax(tuple.t_data);
+               priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data);
                ReleaseBuffer(buffer);
                /* loop back to fetch next in chain */
        }
index 9b9d7941c8b7ea1f504f8c780ae9c8c56b5b663b..ae2d26b48b4b98c05e875cc640d17da8359726fb 100644 (file)
@@ -111,14 +111,29 @@ lnext:
                tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
 
                /* okay, try to lock the tuple */
-               if (erm->markType == ROW_MARK_EXCLUSIVE)
-                       lockmode = LockTupleExclusive;
-               else
-                       lockmode = LockTupleShared;
+               switch (erm->markType)
+               {
+                       case ROW_MARK_EXCLUSIVE:
+                               lockmode = LockTupleExclusive;
+                               break;
+                       case ROW_MARK_NOKEYEXCLUSIVE:
+                               lockmode = LockTupleNoKeyExclusive;
+                               break;
+                       case ROW_MARK_SHARE:
+                               lockmode = LockTupleShare;
+                               break;
+                       case ROW_MARK_KEYSHARE:
+                               lockmode = LockTupleKeyShare;
+                               break;
+                       default:
+                               elog(ERROR, "unsupported rowmark type");
+                               lockmode = LockTupleNoKeyExclusive;     /* keep compiler quiet */
+                               break;
+               }
 
                test = heap_lock_tuple(erm->relation, &tuple,
                                                           estate->es_output_cid,
-                                                          lockmode, erm->noWait,
+                                                          lockmode, erm->noWait, true,
                                                           &buffer, &hufd);
                ReleaseBuffer(buffer);
                switch (test)
index 40cc423c76d07b3b4067aaecab41cf599dddfb79..cb084d03d47759cc10f200a71d3bb8e6f4d3a4ce 100644 (file)
@@ -403,6 +403,7 @@ ldelete:;
                                                                                   epqstate,
                                                                                   resultRelationDesc,
                                                                                   resultRelInfo->ri_RangeTableIndex,
+                                                                                  LockTupleExclusive,
                                                                                   &hufd.ctid,
                                                                                   hufd.xmax);
                                        if (!TupIsNull(epqslot))
@@ -569,6 +570,8 @@ ExecUpdate(ItemPointer tupleid,
        }
        else
        {
+               LockTupleMode   lockmode;
+
                /*
                 * Check the constraints of the tuple
                 *
@@ -595,7 +598,7 @@ lreplace:;
                                                         estate->es_output_cid,
                                                         estate->es_crosscheck_snapshot,
                                                         true /* wait for commit */,
-                                                        &hufd);
+                                                        &hufd, &lockmode);
                switch (result)
                {
                        case HeapTupleSelfUpdated:
@@ -647,6 +650,7 @@ lreplace:;
                                                                                   epqstate,
                                                                                   resultRelationDesc,
                                                                                   resultRelInfo->ri_RangeTableIndex,
+                                                                                  lockmode,
                                                                                   &hufd.ctid,
                                                                                   hufd.xmax);
                                        if (!TupIsNull(epqslot))
index 9a01ec6d5991277edbab458ef156b84de5e174e9..2da08d1cc154018a6a2deb681bf2ec9b09aeb8c3 100644 (file)
@@ -2037,7 +2037,7 @@ _copyRowMarkClause(const RowMarkClause *from)
        RowMarkClause *newnode = makeNode(RowMarkClause);
 
        COPY_SCALAR_FIELD(rti);
-       COPY_SCALAR_FIELD(forUpdate);
+       COPY_SCALAR_FIELD(strength);
        COPY_SCALAR_FIELD(noWait);
        COPY_SCALAR_FIELD(pushedDown);
 
@@ -2400,7 +2400,7 @@ _copyLockingClause(const LockingClause *from)
        LockingClause *newnode = makeNode(LockingClause);
 
        COPY_NODE_FIELD(lockedRels);
-       COPY_SCALAR_FIELD(forUpdate);
+       COPY_SCALAR_FIELD(strength);
        COPY_SCALAR_FIELD(noWait);
 
        return newnode;
index 034159da31ddfcc9f226234d85e52c16871d89f4..9e313c8b1be18da46834a802a5f7cb18d481aeda 100644 (file)
@@ -2210,7 +2210,7 @@ static bool
 _equalLockingClause(const LockingClause *a, const LockingClause *b)
 {
        COMPARE_NODE_FIELD(lockedRels);
-       COMPARE_SCALAR_FIELD(forUpdate);
+       COMPARE_SCALAR_FIELD(strength);
        COMPARE_SCALAR_FIELD(noWait);
 
        return true;
@@ -2283,7 +2283,7 @@ static bool
 _equalRowMarkClause(const RowMarkClause *a, const RowMarkClause *b)
 {
        COMPARE_SCALAR_FIELD(rti);
-       COMPARE_SCALAR_FIELD(forUpdate);
+       COMPARE_SCALAR_FIELD(strength);
        COMPARE_SCALAR_FIELD(noWait);
        COMPARE_SCALAR_FIELD(pushedDown);
 
index 484e426489eb8d9a9b045062eb14c01d2b7f0183..ffd123d5066ad088eb8640d74890837a84428749 100644 (file)
@@ -2111,7 +2111,7 @@ _outLockingClause(StringInfo str, const LockingClause *node)
        WRITE_NODE_TYPE("LOCKINGCLAUSE");
 
        WRITE_NODE_FIELD(lockedRels);
-       WRITE_BOOL_FIELD(forUpdate);
+       WRITE_ENUM_FIELD(strength, LockClauseStrength);
        WRITE_BOOL_FIELD(noWait);
 }
 
@@ -2289,7 +2289,7 @@ _outRowMarkClause(StringInfo str, const RowMarkClause *node)
        WRITE_NODE_TYPE("ROWMARKCLAUSE");
 
        WRITE_UINT_FIELD(rti);
-       WRITE_BOOL_FIELD(forUpdate);
+       WRITE_ENUM_FIELD(strength, LockClauseStrength);
        WRITE_BOOL_FIELD(noWait);
        WRITE_BOOL_FIELD(pushedDown);
 }
index ed2354144c4bbe6eb34c20a3abe3bfad65e4ccae..472c82361ab6eabb1217f9a0cfe9402f5fec560f 100644 (file)
@@ -301,7 +301,7 @@ _readRowMarkClause(void)
        READ_LOCALS(RowMarkClause);
 
        READ_UINT_FIELD(rti);
-       READ_BOOL_FIELD(forUpdate);
+       READ_ENUM_FIELD(strength, LockClauseStrength);
        READ_BOOL_FIELD(noWait);
        READ_BOOL_FIELD(pushedDown);
 
index 774b9d627dc9b4b77991d41cde201224aa1ad72c..11d951cabeabc76f400e5fad39b7d1083d6073e2 100644 (file)
@@ -861,11 +861,11 @@ make_outerjoininfo(PlannerInfo *root,
        Assert(jointype != JOIN_RIGHT);
 
        /*
-        * Presently the executor cannot support FOR UPDATE/SHARE marking of rels
+        * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of rels
         * appearing on the nullable side of an outer join. (It's somewhat unclear
         * what that would mean, anyway: what should we mark when a result row is
         * generated from no element of the nullable relation?)  So, complain if
-        * any nullable rel is FOR UPDATE/SHARE.
+        * any nullable rel is FOR [KEY] UPDATE/SHARE.
         *
         * You might be wondering why this test isn't made far upstream in the
         * parser.      It's because the parser hasn't got enough info --- consider
@@ -883,7 +883,7 @@ make_outerjoininfo(PlannerInfo *root,
                        (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels)))
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("SELECT FOR UPDATE/SHARE cannot be applied to the nullable side of an outer join")));
+                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to the nullable side of an outer join")));
        }
 
        sjinfo->syn_lefthand = left_rels;
index de975d8791d6e79ed905c09bd07885e9408972ed..3e75d3994cb541f18da08cc50422018924953490 100644 (file)
@@ -562,7 +562,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
                                returningLists = NIL;
 
                        /*
-                        * If there was a FOR UPDATE/SHARE clause, the LockRows node will
+                        * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will
                         * have dealt with fetching non-locked marked rows, else we need
                         * to have ModifyTable do that.
                         */
@@ -954,7 +954,7 @@ inheritance_planner(PlannerInfo *root)
        root->simple_rel_array = save_rel_array;
 
        /*
-        * If there was a FOR UPDATE/SHARE clause, the LockRows node will have
+        * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will have
         * dealt with fetching non-locked marked rows, else we need to have
         * ModifyTable do that.
         */
@@ -1065,13 +1065,13 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
                                                                                tlist);
 
                /*
-                * Can't handle FOR UPDATE/SHARE here (parser should have checked
+                * Can't handle FOR [KEY] UPDATE/SHARE here (parser should have checked
                 * already, but let's make sure).
                 */
                if (parse->rowMarks)
                        ereport(ERROR,
                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT")));
+                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE is not allowed with UNION/INTERSECT/EXCEPT")));
 
                /*
                 * Calculate pathkeys that represent result ordering requirements
@@ -1797,7 +1797,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
        }
 
        /*
-        * If there is a FOR UPDATE/SHARE clause, add the LockRows node. (Note: we
+        * If there is a FOR [KEY] UPDATE/SHARE clause, add the LockRows node. (Note: we
         * intentionally test parse->rowMarks not root->rowMarks here. If there
         * are only non-locking rowmarks, they should be handled by the
         * ModifyTable node instead.)
@@ -1983,7 +1983,7 @@ preprocess_rowmarks(PlannerInfo *root)
        if (parse->rowMarks)
        {
                /*
-                * We've got trouble if FOR UPDATE/SHARE appears inside grouping,
+                * We've got trouble if FOR [KEY] UPDATE/SHARE appears inside grouping,
                 * since grouping renders a reference to individual tuple CTIDs
                 * invalid.  This is also checked at parse time, but that's
                 * insufficient because of rule substitution, query pullup, etc.
@@ -1993,7 +1993,7 @@ preprocess_rowmarks(PlannerInfo *root)
        else
        {
                /*
-                * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE.
+                * We only need rowmarks for UPDATE, DELETE, or FOR [KEY] UPDATE/SHARE.
                 */
                if (parse->commandType != CMD_UPDATE &&
                        parse->commandType != CMD_DELETE)
@@ -2003,7 +2003,7 @@ preprocess_rowmarks(PlannerInfo *root)
        /*
         * We need to have rowmarks for all base relations except the target. We
         * make a bitmapset of all base rels and then remove the items we don't
-        * need or have FOR UPDATE/SHARE marks for.
+        * need or have FOR [KEY] UPDATE/SHARE marks for.
         */
        rels = get_base_rel_indexes((Node *) parse->jointree);
        if (parse->resultRelation)
@@ -2020,7 +2020,7 @@ preprocess_rowmarks(PlannerInfo *root)
                PlanRowMark *newrc;
 
                /*
-                * Currently, it is syntactically impossible to have FOR UPDATE
+                * Currently, it is syntactically impossible to have FOR UPDATE et al
                 * applied to an update/delete target rel.      If that ever becomes
                 * possible, we should drop the target from the PlanRowMark list.
                 */
@@ -2040,10 +2040,21 @@ preprocess_rowmarks(PlannerInfo *root)
                newrc = makeNode(PlanRowMark);
                newrc->rti = newrc->prti = rc->rti;
                newrc->rowmarkId = ++(root->glob->lastRowMarkId);
-               if (rc->forUpdate)
-                       newrc->markType = ROW_MARK_EXCLUSIVE;
-               else
-                       newrc->markType = ROW_MARK_SHARE;
+               switch (rc->strength)
+               {
+                       case LCS_FORUPDATE:
+                               newrc->markType = ROW_MARK_EXCLUSIVE;
+                               break;
+                       case LCS_FORNOKEYUPDATE:
+                               newrc->markType = ROW_MARK_NOKEYEXCLUSIVE;
+                               break;
+                       case LCS_FORSHARE:
+                               newrc->markType = ROW_MARK_SHARE;
+                               break;
+                       case LCS_FORKEYSHARE:
+                               newrc->markType = ROW_MARK_KEYSHARE;
+                               break;
+               }
                newrc->noWait = rc->noWait;
                newrc->isParent = false;
 
index 5aa6ecce7b317ceae5b07576a9d50d6eac1dac22..ede41af6dbc28947e0ccfd416f674e49009c3f3e 100644 (file)
@@ -2139,7 +2139,7 @@ transformCreateTableAsStmt(ParseState *pstate, CreateTableAsStmt *stmt)
 
 
 /*
- * Check for features that are not supported together with FOR UPDATE/SHARE.
+ * Check for features that are not supported together with FOR [KEY] UPDATE/SHARE.
  *
  * exported so planner can check again after rewriting, query pullup, etc
  */
@@ -2149,35 +2149,35 @@ CheckSelectLocking(Query *qry)
        if (qry->setOperations)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with UNION/INTERSECT/EXCEPT")));
        if (qry->distinctClause != NIL)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with DISTINCT clause")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with DISTINCT clause")));
        if (qry->groupClause != NIL)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with GROUP BY clause")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with GROUP BY clause")));
        if (qry->havingQual != NULL)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-               errmsg("SELECT FOR UPDATE/SHARE is not allowed with HAVING clause")));
+               errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with HAVING clause")));
        if (qry->hasAggs)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with aggregate functions")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with aggregate functions")));
        if (qry->hasWindowFuncs)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with window functions")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with window functions")));
        if (expression_returns_set((Node *) qry->targetList))
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                errmsg("SELECT FOR UPDATE/SHARE is not allowed with set-returning functions in the target list")));
+                                errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with set-returning functions in the target list")));
 }
 
 /*
- * Transform a FOR UPDATE/SHARE clause
+ * Transform a FOR [KEY] UPDATE/SHARE clause
  *
  * This basically involves replacing names by integer relids.
  *
@@ -2199,7 +2199,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
        /* make a clause we can pass down to subqueries to select all rels */
        allrels = makeNode(LockingClause);
        allrels->lockedRels = NIL;      /* indicates all rels */
-       allrels->forUpdate = lc->forUpdate;
+       allrels->strength = lc->strength;
        allrels->noWait = lc->noWait;
 
        if (lockedRels == NIL)
@@ -2218,15 +2218,15 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
                                        if (rte->relkind == RELKIND_FOREIGN_TABLE)
                                                break;
                                        applyLockingClause(qry, i,
-                                                                          lc->forUpdate, lc->noWait, pushedDown);
+                                                                          lc->strength, lc->noWait, pushedDown);
                                        rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
                                        break;
                                case RTE_SUBQUERY:
                                        applyLockingClause(qry, i,
-                                                                          lc->forUpdate, lc->noWait, pushedDown);
+                                                                          lc->strength, lc->noWait, pushedDown);
 
                                        /*
-                                        * FOR UPDATE/SHARE of subquery is propagated to all of
+                                        * FOR [KEY] UPDATE/SHARE of subquery is propagated to all of
                                         * subquery's rels, too.  We could do this later (based on
                                         * the marking of the subquery RTE) but it is convenient
                                         * to have local knowledge in each query level about which
@@ -2252,7 +2252,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
                        if (thisrel->catalogname || thisrel->schemaname)
                                ereport(ERROR,
                                                (errcode(ERRCODE_SYNTAX_ERROR),
-                                                errmsg("SELECT FOR UPDATE/SHARE must specify unqualified relation names"),
+                                                errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE must specify unqualified relation names"),
                                                 parser_errposition(pstate, thisrel->location)));
 
                        i = 0;
@@ -2269,17 +2269,17 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
                                                        if (rte->relkind == RELKIND_FOREIGN_TABLE)
                                                                ereport(ERROR,
                                                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                                                         errmsg("SELECT FOR UPDATE/SHARE cannot be used with foreign table \"%s\"",
+                                                                         errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be used with foreign table \"%s\"",
                                                                                         rte->eref->aliasname),
                                                                          parser_errposition(pstate, thisrel->location)));
                                                        applyLockingClause(qry, i,
-                                                                                          lc->forUpdate, lc->noWait,
+                                                                                          lc->strength, lc->noWait,
                                                                                           pushedDown);
                                                        rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
                                                        break;
                                                case RTE_SUBQUERY:
                                                        applyLockingClause(qry, i,
-                                                                                          lc->forUpdate, lc->noWait,
+                                                                                          lc->strength, lc->noWait,
                                                                                           pushedDown);
                                                        /* see comment above */
                                                        transformLockingClause(pstate, rte->subquery,
@@ -2288,25 +2288,25 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
                                                case RTE_JOIN:
                                                        ereport(ERROR,
                                                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                                                        errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a join"),
+                                                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a join"),
                                                         parser_errposition(pstate, thisrel->location)));
                                                        break;
                                                case RTE_FUNCTION:
                                                        ereport(ERROR,
                                                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                                                        errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a function"),
+                                                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a function"),
                                                         parser_errposition(pstate, thisrel->location)));
                                                        break;
                                                case RTE_VALUES:
                                                        ereport(ERROR,
                                                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                                                        errmsg("SELECT FOR UPDATE/SHARE cannot be applied to VALUES"),
+                                                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to VALUES"),
                                                         parser_errposition(pstate, thisrel->location)));
                                                        break;
                                                case RTE_CTE:
                                                        ereport(ERROR,
                                                                        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                                                        errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a WITH query"),
+                                                                        errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a WITH query"),
                                                         parser_errposition(pstate, thisrel->location)));
                                                        break;
                                                default:
@@ -2320,7 +2320,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
                        if (rt == NULL)
                                ereport(ERROR,
                                                (errcode(ERRCODE_UNDEFINED_TABLE),
-                                                errmsg("relation \"%s\" in FOR UPDATE/SHARE clause not found in FROM clause",
+                                                errmsg("relation \"%s\" in FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE clause not found in FROM clause",
                                                                thisrel->relname),
                                                 parser_errposition(pstate, thisrel->location)));
                }
@@ -2332,7 +2332,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc,
  */
 void
 applyLockingClause(Query *qry, Index rtindex,
-                                  bool forUpdate, bool noWait, bool pushedDown)
+                                  LockClauseStrength strength, bool noWait, bool pushedDown)
 {
        RowMarkClause *rc;
 
@@ -2344,10 +2344,10 @@ applyLockingClause(Query *qry, Index rtindex,
        if ((rc = get_parse_rowmark(qry, rtindex)) != NULL)
        {
                /*
-                * If the same RTE is specified both FOR UPDATE and FOR SHARE, treat
-                * it as FOR UPDATE.  (Reasonable, since you can't take both a shared
-                * and exclusive lock at the same time; it'll end up being exclusive
-                * anyway.)
+                * If the same RTE is specified for more than one locking strength,
+                * treat it as the strongest.  (Reasonable, since you can't take both a
+                * shared and exclusive lock at the same time; it'll end up being
+                * exclusive anyway.)
                 *
                 * We also consider that NOWAIT wins if it's specified both ways. This
                 * is a bit more debatable but raising an error doesn't seem helpful.
@@ -2356,7 +2356,7 @@ applyLockingClause(Query *qry, Index rtindex,
                 *
                 * And of course pushedDown becomes false if any clause is explicit.
                 */
-               rc->forUpdate |= forUpdate;
+               rc->strength = Max(rc->strength, strength);
                rc->noWait |= noWait;
                rc->pushedDown &= pushedDown;
                return;
@@ -2365,7 +2365,7 @@ applyLockingClause(Query *qry, Index rtindex,
        /* Make a new RowMarkClause */
        rc = makeNode(RowMarkClause);
        rc->rti = rtindex;
-       rc->forUpdate = forUpdate;
+       rc->strength = strength;
        rc->noWait = noWait;
        rc->pushedDown = pushedDown;
        qry->rowMarks = lappend(qry->rowMarks, rc);
index b19afa88e735cf8016a19722e4296b5463560c32..828e11058e90040ab33396a7cebe25ec90303d9c 100644 (file)
@@ -361,6 +361,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType,
 %type <ival>    OptTemp
 %type <oncommit> OnCommitOption
 
+%type <ival>   for_locking_strength
 %type <node>   for_locking_item
 %type <list>   for_locking_clause opt_for_locking_clause for_locking_items
 %type <list>   locked_rels_list
@@ -8900,9 +8901,10 @@ select_with_parens:
  * The duplicative productions are annoying, but hard to get rid of without
  * creating shift/reduce conflicts.
  *
- *     FOR UPDATE/SHARE may be before or after LIMIT/OFFSET.
+ *     The locking clause (FOR UPDATE etc) may be before or after LIMIT/OFFSET.
  *     In <=7.2.X, LIMIT/OFFSET had to be after FOR UPDATE
- *     We now support both orderings, but prefer LIMIT/OFFSET before FOR UPDATE/SHARE
+ *     We now support both orderings, but prefer LIMIT/OFFSET before the locking
+ *     clause.
  *     2002-08-28 bjm
  */
 select_no_parens:
@@ -9321,24 +9323,23 @@ for_locking_items:
                ;
 
 for_locking_item:
-                       FOR UPDATE locked_rels_list opt_nowait
+                       for_locking_strength locked_rels_list opt_nowait
                                {
                                        LockingClause *n = makeNode(LockingClause);
-                                       n->lockedRels = $3;
-                                       n->forUpdate = TRUE;
-                                       n->noWait = $4;
-                                       $$ = (Node *) n;
-                               }
-                       | FOR SHARE locked_rels_list opt_nowait
-                               {
-                                       LockingClause *n = makeNode(LockingClause);
-                                       n->lockedRels = $3;
-                                       n->forUpdate = FALSE;
-                                       n->noWait = $4;
+                                       n->lockedRels = $2;
+                                       n->strength = $1;
+                                       n->noWait = $3;
                                        $$ = (Node *) n;
                                }
                ;
 
+for_locking_strength:
+                       FOR UPDATE                                                      { $$ = LCS_FORUPDATE; }
+                       | FOR NO KEY UPDATE                             { $$ = LCS_FORNOKEYUPDATE; }
+                       | FOR SHARE                                             { $$ = LCS_FORSHARE; }
+                       | FOR KEY SHARE                                         { $$ = LCS_FORKEYSHARE; }
+               ;
+
 locked_rels_list:
                        OF qualified_name_list                                  { $$ = $2; }
                        | /* EMPTY */                                                   { $$ = NIL; }
index 564edf2e556112ac0c6f3f77781bc9ff272cc4cf..7ab08018876fd929a4f0837946bdb109f64fa6d0 100644 (file)
@@ -69,6 +69,7 @@
 
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/multixact.h"
 #include "access/reloptions.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -136,8 +137,9 @@ static volatile sig_atomic_t got_SIGHUP = false;
 static volatile sig_atomic_t got_SIGUSR2 = false;
 static volatile sig_atomic_t got_SIGTERM = false;
 
-/* Comparison point for determining whether freeze_max_age is exceeded */
+/* Comparison points for determining whether freeze_max_age is exceeded */
 static TransactionId recentXid;
+static MultiXactId recentMulti;
 
 /* Default freeze ages to use for autovacuum (varies by database) */
 static int     default_freeze_min_age;
@@ -161,6 +163,7 @@ typedef struct avw_dbase
        Oid                     adw_datid;
        char       *adw_name;
        TransactionId adw_frozenxid;
+       MultiXactId     adw_frozenmulti;
        PgStat_StatDBEntry *adw_entry;
 } avw_dbase;
 
@@ -1076,7 +1079,9 @@ do_start_worker(void)
        List       *dblist;
        ListCell   *cell;
        TransactionId xidForceLimit;
+       MultiXactId multiForceLimit;
        bool            for_xid_wrap;
+       bool            for_multi_wrap;
        avw_dbase  *avdb;
        TimestampTz current_time;
        bool            skipit = false;
@@ -1122,12 +1127,20 @@ do_start_worker(void)
        if (xidForceLimit < FirstNormalTransactionId)
                xidForceLimit -= FirstNormalTransactionId;
 
+       /* Also determine the oldest datminmxid we will consider. */
+       recentMulti = ReadNextMultiXactId();
+       multiForceLimit = recentMulti - autovacuum_freeze_max_age;
+       if (multiForceLimit < FirstMultiXactId)
+               multiForceLimit -= FirstMultiXactId;
+
        /*
         * Choose a database to connect to.  We pick the database that was least
         * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
-        * wraparound-related data loss.  If any db at risk of wraparound is
+        * wraparound-related data loss.  If any db at risk of Xid wraparound is
         * found, we pick the one with oldest datfrozenxid, independently of
-        * autovacuum times.
+        * autovacuum times; similarly we pick the one with the oldest datminmxid
+        * if any is at risk of MultiXactId wraparound.  Note that those in Xid
+        * wraparound danger take priority over those in multi wraparound danger.
         *
         * Note that a database with no stats entry is not considered, except for
         * Xid wraparound purposes.  The theory is that if no one has ever
@@ -1143,6 +1156,7 @@ do_start_worker(void)
         */
        avdb = NULL;
        for_xid_wrap = false;
+       for_multi_wrap = false;
        current_time = GetCurrentTimestamp();
        foreach(cell, dblist)
        {
@@ -1153,13 +1167,25 @@ do_start_worker(void)
                if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
                {
                        if (avdb == NULL ||
-                         TransactionIdPrecedes(tmp->adw_frozenxid, avdb->adw_frozenxid))
+                               TransactionIdPrecedes(tmp->adw_frozenxid,
+                                                                         avdb->adw_frozenxid))
                                avdb = tmp;
                        for_xid_wrap = true;
                        continue;
                }
                else if (for_xid_wrap)
                        continue;                       /* ignore not-at-risk DBs */
+               else if (MultiXactIdPrecedes(tmp->adw_frozenmulti, multiForceLimit))
+               {
+                       if (avdb == NULL ||
+                               MultiXactIdPrecedes(tmp->adw_frozenmulti,
+                                                                       avdb->adw_frozenmulti))
+                               avdb = tmp;
+                       for_multi_wrap = true;
+                       continue;
+               }
+               else if (for_multi_wrap)
+                       continue;                       /* ignore not-at-risk DBs */
 
                /* Find pgstat entry if any */
                tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
@@ -1642,6 +1668,7 @@ AutoVacWorkerMain(int argc, char *argv[])
 
                /* And do an appropriate amount of work */
                recentXid = ReadNewTransactionId();
+               recentMulti = ReadNextMultiXactId();
                do_autovacuum();
        }
 
@@ -1847,6 +1874,7 @@ get_database_list(void)
                avdb->adw_datid = HeapTupleGetOid(tup);
                avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
                avdb->adw_frozenxid = pgdatabase->datfrozenxid;
+               avdb->adw_frozenmulti = pgdatabase->datminmxid;
                /* this gets set later: */
                avdb->adw_entry = NULL;
 
@@ -2601,6 +2629,7 @@ relation_needs_vacanalyze(Oid relid,
        /* freeze parameters */
        int                     freeze_max_age;
        TransactionId xidForceLimit;
+       MultiXactId     multiForceLimit;
 
        AssertArg(classForm != NULL);
        AssertArg(OidIsValid(relid));
@@ -2641,6 +2670,14 @@ relation_needs_vacanalyze(Oid relid,
        force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
                                        TransactionIdPrecedes(classForm->relfrozenxid,
                                                                                  xidForceLimit));
+       if (!force_vacuum)
+       {
+               multiForceLimit = recentMulti - autovacuum_freeze_max_age;
+               if (multiForceLimit < FirstMultiXactId)
+                       multiForceLimit -= FirstMultiXactId;
+               force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
+                                                                                  multiForceLimit);
+       }
        *wraparound = force_vacuum;
 
        /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
index 33df6aecd6c2653e603d60a5f66e6d9b44d8cc12..b458de697112053cecaf1f31927ccc13b4a1cdc7 100644 (file)
@@ -55,7 +55,7 @@ static void rewriteValuesRTE(RangeTblEntry *rte, Relation target_relation,
 static void rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte,
                                        Relation target_relation);
 static void markQueryForLocking(Query *qry, Node *jtnode,
-                                       bool forUpdate, bool noWait, bool pushedDown);
+                                       LockClauseStrength strength, bool noWait, bool pushedDown);
 static List *matchLocks(CmdType event, RuleLock *rulelocks,
                   int varno, Query *parsetree);
 static Query *fireRIRrules(Query *parsetree, List *activeRIRs,
@@ -68,7 +68,7 @@ static Query *fireRIRrules(Query *parsetree, List *activeRIRs,
  *       These locks will ensure that the relation schemas don't change under us
  *       while we are rewriting and planning the query.
  *
- * forUpdatePushedDown indicates that a pushed-down FOR UPDATE/SHARE applies
+ * forUpdatePushedDown indicates that a pushed-down FOR [KEY] UPDATE/SHARE applies
  * to the current subquery, requiring all rels to be opened with RowShareLock.
  * This should always be false at the start of the recursion.
  *
@@ -130,7 +130,7 @@ AcquireRewriteLocks(Query *parsetree, bool forUpdatePushedDown)
                                 *
                                 * If the relation is the query's result relation, then we
                                 * need RowExclusiveLock.  Otherwise, check to see if the
-                                * relation is accessed FOR UPDATE/SHARE or not.  We can't
+                                * relation is accessed FOR [KEY] UPDATE/SHARE or not.  We can't
                                 * just grab AccessShareLock because then the executor would
                                 * be trying to upgrade the lock, leading to possible
                                 * deadlocks.
@@ -1357,7 +1357,7 @@ ApplyRetrieveRule(Query *parsetree,
        }
 
        /*
-        * If FOR UPDATE/SHARE of view, be sure we get right initial lock on the
+        * If FOR [KEY] UPDATE/SHARE of view, be sure we get right initial lock on the
         * relations it references.
         */
        rc = get_parse_rowmark(parsetree, rt_index);
@@ -1405,8 +1405,8 @@ ApplyRetrieveRule(Query *parsetree,
        rte->modifiedCols = NULL;
 
        /*
-        * If FOR UPDATE/SHARE of view, mark all the contained tables as implicit
-        * FOR UPDATE/SHARE, the same as the parser would have done if the view's
+        * If FOR [KEY] UPDATE/SHARE of view, mark all the contained tables as implicit
+        * FOR [KEY] UPDATE/SHARE, the same as the parser would have done if the view's
         * subquery had been written out explicitly.
         *
         * Note: we don't consider forUpdatePushedDown here; such marks will be
@@ -1414,13 +1414,13 @@ ApplyRetrieveRule(Query *parsetree,
         */
        if (rc != NULL)
                markQueryForLocking(rule_action, (Node *) rule_action->jointree,
-                                                       rc->forUpdate, rc->noWait, true);
+                                                       rc->strength, rc->noWait, true);
 
        return parsetree;
 }
 
 /*
- * Recursively mark all relations used by a view as FOR UPDATE/SHARE.
+ * Recursively mark all relations used by a view as FOR [KEY] UPDATE/SHARE.
  *
  * This may generate an invalid query, eg if some sub-query uses an
  * aggregate.  We leave it to the planner to detect that.
@@ -1432,7 +1432,7 @@ ApplyRetrieveRule(Query *parsetree,
  */
 static void
 markQueryForLocking(Query *qry, Node *jtnode,
-                                       bool forUpdate, bool noWait, bool pushedDown)
+                                       LockClauseStrength strength, bool noWait, bool pushedDown)
 {
        if (jtnode == NULL)
                return;
@@ -1446,16 +1446,16 @@ markQueryForLocking(Query *qry, Node *jtnode,
                        /* ignore foreign tables */
                        if (rte->relkind != RELKIND_FOREIGN_TABLE)
                        {
-                               applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
+                               applyLockingClause(qry, rti, strength, noWait, pushedDown);
                                rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
                        }
                }
                else if (rte->rtekind == RTE_SUBQUERY)
                {
-                       applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
-                       /* FOR UPDATE/SHARE of subquery is propagated to subquery's rels */
+                       applyLockingClause(qry, rti, strength, noWait, pushedDown);
+                       /* FOR [KEY] UPDATE/SHARE of subquery is propagated to subquery's rels */
                        markQueryForLocking(rte->subquery, (Node *) rte->subquery->jointree,
-                                                               forUpdate, noWait, true);
+                                                               strength, noWait, true);
                }
                /* other RTE types are unaffected by FOR UPDATE */
        }
@@ -1465,14 +1465,14 @@ markQueryForLocking(Query *qry, Node *jtnode,
                ListCell   *l;
 
                foreach(l, f->fromlist)
-                       markQueryForLocking(qry, lfirst(l), forUpdate, noWait, pushedDown);
+                       markQueryForLocking(qry, lfirst(l), strength, noWait, pushedDown);
        }
        else if (IsA(jtnode, JoinExpr))
        {
                JoinExpr   *j = (JoinExpr *) jtnode;
 
-               markQueryForLocking(qry, j->larg, forUpdate, noWait, pushedDown);
-               markQueryForLocking(qry, j->rarg, forUpdate, noWait, pushedDown);
+               markQueryForLocking(qry, j->larg, strength, noWait, pushedDown);
+               markQueryForLocking(qry, j->rarg, strength, noWait, pushedDown);
        }
        else
                elog(ERROR, "unrecognized node type: %d",
index 84637fe5815db546d9e2da0717e7ef0fafb164bd..2c1c6524897dc207bcfc0156de0542332f79845e 100644 (file)
@@ -538,6 +538,20 @@ ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode)
        return lockhash;
 }
 
+/*
+ * DoLockModesConflict -- report whether two lock modes conflict.
+ *
+ * The answer comes from the conflict table of the default lock method.
+ */
+bool
+DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
+{
+       LockMethod      defaultMethod = LockMethods[DEFAULT_LOCKMETHOD];
+
+       /* conflict iff mode2's bit is set in mode1's conflict mask */
+       return (defaultMethod->conflictTab[mode1] & LOCKBIT_ON(mode2)) != 0;
+}
+
 /*
  * LockHasWaiters -- look up 'locktag' and check if releasing this
  *             lock would wake up other processes waiting for it.
@@ -630,7 +644,6 @@ LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
        return hasWaiters;
 }
 
-
 /*
  * LockAcquire -- Check for lock conflicts, sleep if conflict found,
  *             set lock if/when no conflicts.
index 90a9e2a915592337b7e17f24b32b40d3f82121d7..51c350797d1724e4cb359b261faf1740be60a50f 100644 (file)
@@ -3905,10 +3905,10 @@ CheckForSerializableConflictOut(bool visible, Relation relation,
                case HEAPTUPLE_RECENTLY_DEAD:
                        if (!visible)
                                return;
-                       xid = HeapTupleHeaderGetXmax(tuple->t_data);
+                       xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
                        break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:
-                       xid = HeapTupleHeaderGetXmax(tuple->t_data);
+                       xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
                        break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                        xid = HeapTupleHeaderGetXmin(tuple->t_data);
index 598e20f91c3685db73e16eaaf8e9ef4f16069182..8904c6f2dac4558fde14ace21f935217f8610474 100644 (file)
@@ -131,7 +131,7 @@ CommandIsReadOnly(Node *parsetree)
                {
                        case CMD_SELECT:
                                if (stmt->rowMarks != NIL)
-                                       return false;           /* SELECT FOR UPDATE/SHARE */
+                                       return false;           /* SELECT FOR [KEY] UPDATE/SHARE */
                                else if (stmt->hasModifyingCTE)
                                        return false;           /* data-modifying CTE */
                                else
@@ -2283,10 +2283,28 @@ CreateCommandTag(Node *parsetree)
                                                else if (stmt->rowMarks != NIL)
                                                {
                                                        /* not 100% but probably close enough */
-                                                       if (((PlanRowMark *) linitial(stmt->rowMarks))->markType == ROW_MARK_EXCLUSIVE)
-                                                               tag = "SELECT FOR UPDATE";
-                                                       else
-                                                               tag = "SELECT FOR SHARE";
+                                                       switch (((PlanRowMark *) linitial(stmt->rowMarks))->markType)
+                                                       {
+                                                               case ROW_MARK_EXCLUSIVE:
+                                                                       tag = "SELECT FOR UPDATE";
+                                                                       break;
+                                                               case ROW_MARK_NOKEYEXCLUSIVE:
+                                                                       tag = "SELECT FOR NO KEY UPDATE";
+                                                                       break;
+                                                               case ROW_MARK_SHARE:
+                                                                       tag = "SELECT FOR SHARE";
+                                                                       break;
+                                                               case ROW_MARK_KEYSHARE:
+                                                                       tag = "SELECT FOR KEY SHARE";
+                                                                       break;
+                                                               case ROW_MARK_REFERENCE:
+                                                               case ROW_MARK_COPY:
+                                                                       tag = "SELECT";
+                                                                       break;
+                                                               default:
+                                                                       tag = "???";
+                                                                       break;
+                                                       }
                                                }
                                                else
                                                        tag = "SELECT";
@@ -2331,10 +2349,24 @@ CreateCommandTag(Node *parsetree)
                                                else if (stmt->rowMarks != NIL)
                                                {
                                                        /* not 100% but probably close enough */
-                                                       if (((RowMarkClause *) linitial(stmt->rowMarks))->forUpdate)
-                                                               tag = "SELECT FOR UPDATE";
-                                                       else
-                                                               tag = "SELECT FOR SHARE";
+                                                       switch (((RowMarkClause *) linitial(stmt->rowMarks))->strength)
+                                                       {
+                                                               case LCS_FORKEYSHARE:
+                                                                       tag = "SELECT FOR KEY SHARE";
+                                                                       break;
+                                                               case LCS_FORSHARE:
+                                                                       tag = "SELECT FOR SHARE";
+                                                                       break;
+                                                               case LCS_FORNOKEYUPDATE:
+                                                                       tag = "SELECT FOR NO KEY UPDATE";
+                                                                       break;
+                                                               case LCS_FORUPDATE:
+                                                                       tag = "SELECT FOR UPDATE";
+                                                                       break;
+                                                               default:
+                                                                       tag =  "???";
+                                                                       break;
+                                                       }
                                                }
                                                else
                                                        tag = "SELECT";
index 601d5ec861f19943363844b2969df3222b88133f..243bdebbd224841cb4e10c80eaac57def85b72db 100644 (file)
@@ -299,7 +299,7 @@ RI_FKey_check(TriggerData *trigdata)
         * Get the relation descriptors of the FK and PK tables.
         *
         * pk_rel is opened in RowShareLock mode since that's what our eventual
-        * SELECT FOR SHARE will get on it.
+        * SELECT FOR KEY SHARE will get on it.
         */
        fk_rel = trigdata->tg_relation;
        pk_rel = heap_open(riinfo->pk_relid, RowShareLock);
@@ -400,7 +400,8 @@ RI_FKey_check(TriggerData *trigdata)
 
                /* ----------
                 * The query string built is
-                *      SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE
+                *      SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...]
+                *             FOR KEY SHARE OF x
                 * The type id's for the $ parameters are those of the
                 * corresponding FK attributes.
                 * ----------
@@ -424,7 +425,7 @@ RI_FKey_check(TriggerData *trigdata)
                        querysep = "AND";
                        queryoids[i] = fk_type;
                }
-               appendStringInfo(&querybuf, " FOR SHARE OF x");
+               appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
                /* Prepare and save the plan */
                qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids,
@@ -535,7 +536,8 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel,
 
                /* ----------
                 * The query string built is
-                *      SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE
+                *      SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...]
+                *             FOR KEY SHARE OF x
                 * The type id's for the $ parameters are those of the
                 * PK attributes themselves.
                 * ----------
@@ -558,7 +560,7 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel,
                        querysep = "AND";
                        queryoids[i] = pk_type;
                }
-               appendStringInfo(&querybuf, " FOR SHARE OF x");
+               appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
                /* Prepare and save the plan */
                qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids,
@@ -655,7 +657,7 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action)
         * Get the relation descriptors of the FK and PK tables and the old tuple.
         *
         * fk_rel is opened in RowShareLock mode since that's what our eventual
-        * SELECT FOR SHARE will get on it.
+        * SELECT FOR KEY SHARE will get on it.
         */
        fk_rel = heap_open(riinfo->fk_relid, RowShareLock);
        pk_rel = trigdata->tg_relation;
@@ -724,7 +726,8 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action)
 
                                /* ----------
                                 * The query string built is
-                                *      SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...]
+                                *      SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...]
+                                *             FOR KEY SHARE OF x
                                 * The type id's for the $ parameters are those of the
                                 * corresponding PK attributes.
                                 * ----------
@@ -749,7 +752,7 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action)
                                        querysep = "AND";
                                        queryoids[i] = pk_type;
                                }
-                               appendStringInfo(&querybuf, " FOR SHARE OF x");
+                               appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
                                /* Prepare and save the plan */
                                qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids,
@@ -868,7 +871,7 @@ ri_restrict_upd(TriggerData *trigdata, bool is_no_action)
         * old tuple.
         *
         * fk_rel is opened in RowShareLock mode since that's what our eventual
-        * SELECT FOR SHARE will get on it.
+        * SELECT FOR KEY SHARE will get on it.
         */
        fk_rel = heap_open(riinfo->fk_relid, RowShareLock);
        pk_rel = trigdata->tg_relation;
@@ -972,7 +975,7 @@ ri_restrict_upd(TriggerData *trigdata, bool is_no_action)
                                        querysep = "AND";
                                        queryoids[i] = pk_type;
                                }
-                               appendStringInfo(&querybuf, " FOR SHARE OF x");
+                               appendStringInfo(&querybuf, " FOR KEY SHARE OF x");
 
                                /* Prepare and save the plan */
                                qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids,
index af10471581795f91899c022ef927cd3362cbf530..16f56c6adec43f6e5c7949b4e90d46012403ae95 100644 (file)
@@ -4194,7 +4194,7 @@ get_select_query_def(Query *query, deparse_context *context,
                        get_rule_expr(query->limitCount, context, false);
        }
 
-       /* Add FOR UPDATE/SHARE clauses if present */
+       /* Add FOR [KEY] UPDATE/SHARE clauses if present */
        if (query->hasForUpdate)
        {
                foreach(l, query->rowMarks)
@@ -4205,12 +4205,26 @@ get_select_query_def(Query *query, deparse_context *context,
                        if (rc->pushedDown)
                                continue;
 
-                       if (rc->forUpdate)
-                               appendContextKeyword(context, " FOR UPDATE",
-                                                                        -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
-                       else
-                               appendContextKeyword(context, " FOR SHARE",
-                                                                        -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+                       switch (rc->strength)
+                       {
+                               case LCS_FORKEYSHARE:
+                                       appendContextKeyword(context, " FOR KEY SHARE",
+                                                                                -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+                                       break;
+                               case LCS_FORSHARE:
+                                       appendContextKeyword(context, " FOR SHARE",
+                                                                                -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+                                       break;
+                               case LCS_FORNOKEYUPDATE:
+                                       appendContextKeyword(context, " FOR NO KEY UPDATE",
+                                                                                -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+                                       break;
+                               case LCS_FORUPDATE:
+                                       appendContextKeyword(context, " FOR UPDATE",
+                                                                                -PRETTYINDENT_STD, PRETTYINDENT_STD, 0);
+                                       break;
+                       }
+
                        appendStringInfo(buf, " OF %s",
                                                         quote_identifier(get_rtable_name(rc->rti,
                                                                                                                          context)));
index 40238e959e6a7f42437730fb69619008d57a6287..fa48b1ce1abc585b90ac390a750c2fab9c5c400c 100644 (file)
 #include <fcntl.h>
 #include <unistd.h>
 
+#include "access/htup_details.h"
+#include "access/multixact.h"
 #include "access/reloptions.h"
 #include "access/sysattr.h"
 #include "access/transam.h"
-#include "access/htup_details.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
 #include "catalog/index.h"
@@ -2725,7 +2726,8 @@ RelationBuildLocalRelation(const char *relname,
  * the XIDs that will be put into the new relation contents.
  */
 void
-RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
+RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid,
+                                                 MultiXactId minmulti)
 {
        Oid                     newrelfilenode;
        RelFileNodeBackend newrnode;
@@ -2738,6 +2740,7 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
                        relation->rd_rel->relkind == RELKIND_SEQUENCE) ?
                   freezeXid == InvalidTransactionId :
                   TransactionIdIsNormal(freezeXid));
+       Assert(TransactionIdIsNormal(freezeXid) == MultiXactIdIsValid(minmulti));
 
        /* Allocate a new relfilenode */
        newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
@@ -2793,6 +2796,7 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
                classform->relallvisible = 0;
        }
        classform->relfrozenxid = freezeXid;
+       classform->relminmxid = minmulti;
 
        simple_heap_update(pg_class, &tuple->t_self, tuple);
        CatalogUpdateIndexes(pg_class, tuple);
@@ -3764,6 +3768,9 @@ RelationGetIndexPredicate(Relation relation)
  * simple index keys, but attributes used in expressions and partial-index
  * predicates.)
  *
+ * If "keyAttrs" is true, only attributes that can be referenced by foreign
+ * keys are considered.
+ *
  * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
  * we can include system attributes (e.g., OID) in the bitmap representation.
  *
@@ -3775,16 +3782,17 @@ RelationGetIndexPredicate(Relation relation)
  * be bms_free'd when not needed anymore.
  */
 Bitmapset *
-RelationGetIndexAttrBitmap(Relation relation)
+RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs)
 {
        Bitmapset  *indexattrs;
+       Bitmapset  *uindexattrs;
        List       *indexoidlist;
        ListCell   *l;
        MemoryContext oldcxt;
 
        /* Quick exit if we already computed the result. */
        if (relation->rd_indexattr != NULL)
-               return bms_copy(relation->rd_indexattr);
+               return bms_copy(keyAttrs ? relation->rd_keyattr : relation->rd_indexattr);
 
        /* Fast path if definitely no indexes */
        if (!RelationGetForm(relation)->relhasindex)
@@ -3810,26 +3818,38 @@ RelationGetIndexAttrBitmap(Relation relation)
         * won't be returned at all by RelationGetIndexList.
         */
        indexattrs = NULL;
+       uindexattrs = NULL;
        foreach(l, indexoidlist)
        {
                Oid                     indexOid = lfirst_oid(l);
                Relation        indexDesc;
                IndexInfo  *indexInfo;
                int                     i;
+               bool            isKey;
 
                indexDesc = index_open(indexOid, AccessShareLock);
 
                /* Extract index key information from the index's pg_index row */
                indexInfo = BuildIndexInfo(indexDesc);
 
+               /* Can this index be referenced by a foreign key? */
+               isKey = indexInfo->ii_Unique &&
+                               indexInfo->ii_Expressions == NIL &&
+                               indexInfo->ii_Predicate == NIL;
+
                /* Collect simple attribute references */
                for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
                {
                        int                     attrnum = indexInfo->ii_KeyAttrNumbers[i];
 
                        if (attrnum != 0)
+                       {
                                indexattrs = bms_add_member(indexattrs,
                                                           attrnum - FirstLowInvalidHeapAttributeNumber);
+                               if (isKey)
+                                       uindexattrs = bms_add_member(uindexattrs,
+                                                                                                attrnum - FirstLowInvalidHeapAttributeNumber);
+                       }
                }
 
                /* Collect all attributes used in expressions, too */
@@ -3846,10 +3866,11 @@ RelationGetIndexAttrBitmap(Relation relation)
        /* Now save a copy of the bitmap in the relcache entry. */
        oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
        relation->rd_indexattr = bms_copy(indexattrs);
+       relation->rd_keyattr = bms_copy(uindexattrs);
        MemoryContextSwitchTo(oldcxt);
 
        /* We return our original working copy for caller to play with */
-       return indexattrs;
+       return keyAttrs ? uindexattrs : indexattrs;
 }
 
 /*
index 38f702892f766fdf27916b1f6abbf6a1e47f2e86..923355d3ceb4f3e49a584ab3c420c7b31b0272ae 100644 (file)
@@ -118,9 +118,8 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup)
 {
        CommandId       cid = HeapTupleHeaderGetRawCommandId(tup);
 
-       /* We do not store cmax when locking a tuple */
-       Assert(!(tup->t_infomask & (HEAP_MOVED | HEAP_IS_LOCKED)));
-       Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup)));
+       Assert(!(tup->t_infomask & HEAP_MOVED));
+       Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tup)));
 
        if (tup->t_infomask & HEAP_COMBOCID)
                return GetRealCmax(cid);
index 51f0afded98d468a9b53478ff465f50157aa87ab..f2c9ff2e1c16860d779d7538deaed758c92719ef 100644 (file)
@@ -214,12 +214,25 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return true;
 
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)         /* not deleter */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))                /* not deleter */
                                return true;
 
-                       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+                       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               TransactionId   xmax;
+
+                               xmax = HeapTupleGetUpdateXid(tuple);
+                               if (!TransactionIdIsValid(xmax))
+                                       return true;
+
+                               /* updating subtransaction must have aborted */
+                               if (!TransactionIdIsCurrentTransactionId(xmax))
+                                       return true;
+                               else
+                                       return false;
+                       }
 
-                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                        {
                                /* deleting subtransaction must have aborted */
                                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -250,29 +263,41 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
        if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                return false;                   /* updated by other */
        }
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+               TransactionId   xmax;
+
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+                       return true;
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+                       return true;
+               if (TransactionIdIsCurrentTransactionId(xmax))
+                       return false;
+               if (TransactionIdIsInProgress(xmax))
+                       return true;
+               if (TransactionIdDidCommit(xmax))
+                       return false;
                return true;
        }
 
-       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                return false;
        }
 
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                return true;
 
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
        {
                /* it must have aborted or crashed */
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -282,7 +307,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
        /* xmax transaction committed */
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        {
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                        InvalidTransactionId);
@@ -290,7 +315,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
        }
 
        SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                               HeapTupleHeaderGetXmax(tuple));
+                               HeapTupleHeaderGetRawXmax(tuple));
        return false;
 }
 
@@ -380,12 +405,25 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return true;
 
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)         /* not deleter */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))                /* not deleter */
                                return true;
 
-                       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+                       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               TransactionId   xmax;
 
-                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+                               xmax = HeapTupleGetUpdateXid(tuple);
+                               if (!TransactionIdIsValid(xmax))
+                                       return true;
+
+                               /* updating subtransaction must have aborted */
+                               if (!TransactionIdIsCurrentTransactionId(xmax))
+                                       return true;
+                               else
+                                       return false;
+                       }
+
+                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                        {
                                /* deleting subtransaction must have aborted */
                                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -419,21 +457,38 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
        if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                return false;
        }
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+               TransactionId   xmax;
+
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+                       return true;
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+                       return true;
+               if (TransactionIdIsCurrentTransactionId(xmax))
+               {
+                       if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false))
+                               return true;    /* deleted after scan started */
+                       else
+                               return false;   /* deleted before scan started */
+               }
+               if (TransactionIdIsInProgress(xmax))
+                       return true;
+               if (TransactionIdDidCommit(xmax))
+                       return false;
                return true;
        }
 
-       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false))
                        return true;            /* deleted after scan started */
@@ -441,10 +496,10 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
                        return false;           /* deleted before scan started */
        }
 
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                return true;
 
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
        {
                /* it must have aborted or crashed */
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -454,7 +509,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
 
        /* xmax transaction committed */
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        {
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                        InvalidTransactionId);
@@ -462,7 +517,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
        }
 
        SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                               HeapTupleHeaderGetXmax(tuple));
+                               HeapTupleHeaderGetRawXmax(tuple));
        return false;
 }
 
@@ -627,12 +682,30 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return HeapTupleMayBeUpdated;
 
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)         /* not deleter */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))                 /* not deleter */
                                return HeapTupleMayBeUpdated;
 
-                       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+                       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               TransactionId   xmax;
+
+                               xmax = HeapTupleGetUpdateXid(tuple);
+                               if (!TransactionIdIsValid(xmax))
+                                       return HeapTupleMayBeUpdated;
+
+                               /* updating subtransaction must have aborted */
+                               if (!TransactionIdIsCurrentTransactionId(xmax))
+                                       return HeapTupleMayBeUpdated;
+                               else
+                               {
+                                       if (HeapTupleHeaderGetCmax(tuple) >= curcid)
+                                               return HeapTupleSelfUpdated;    /* updated after scan started */
+                                       else
+                                               return HeapTupleInvisible;      /* updated before scan started */
+                               }
+                       }
 
-                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                        {
                                /* deleting subtransaction must have aborted */
                                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -666,26 +739,62 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
        if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return HeapTupleMayBeUpdated;
                return HeapTupleUpdated;        /* updated by other */
        }
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+               TransactionId   xmax;
 
-               if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+               {
+                       /*
+                        * If it's only locked but neither EXCL_LOCK nor KEYSHR_LOCK
+                        * is set, it cannot possibly be running.  Otherwise need to
+                        * check.
+                        */
+                       if ((tuple->t_infomask & (HEAP_XMAX_EXCL_LOCK |
+                                                                         HEAP_XMAX_KEYSHR_LOCK)) &&
+                               MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple)))
+                               return HeapTupleBeingUpdated;
+
+                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId);
+                       return HeapTupleMayBeUpdated;
+               }
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+               {
+                       if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple)))
+                               return HeapTupleBeingUpdated;
+
+                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId);
+                       return HeapTupleMayBeUpdated;
+               }
+
+               if (TransactionIdIsCurrentTransactionId(xmax))
+               {
+                       if (HeapTupleHeaderGetCmax(tuple) >= curcid)
+                               return HeapTupleSelfUpdated;            /* updated after scan started */
+                       else
+                               return HeapTupleInvisible;      /* updated before scan started */
+               }
+
+               if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple)))
                        return HeapTupleBeingUpdated;
-               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-                                       InvalidTransactionId);
+
+               if (TransactionIdDidCommit(xmax))
+                       return HeapTupleUpdated;
+               /* it must have aborted or crashed */
+               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId);
                return HeapTupleMayBeUpdated;
        }
 
-       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return HeapTupleMayBeUpdated;
                if (HeapTupleHeaderGetCmax(tuple) >= curcid)
                        return HeapTupleSelfUpdated;            /* updated after scan started */
@@ -693,10 +802,10 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
                        return HeapTupleInvisible;      /* updated before scan started */
        }
 
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                return HeapTupleBeingUpdated;
 
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
        {
                /* it must have aborted or crashed */
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -706,7 +815,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
 
        /* xmax transaction committed */
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        {
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                        InvalidTransactionId);
@@ -714,7 +823,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
        }
 
        SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                               HeapTupleHeaderGetXmax(tuple));
+                               HeapTupleHeaderGetRawXmax(tuple));
        return HeapTupleUpdated;        /* updated by other */
 }
 
@@ -793,12 +902,25 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return true;
 
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)         /* not deleter */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))                 /* not deleter */
                                return true;
 
-                       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+                       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               TransactionId   xmax;
 
-                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+                               xmax = HeapTupleGetUpdateXid(tuple);
+                               if (!TransactionIdIsValid(xmax))
+                                       return true;
+
+                               /* updating subtransaction must have aborted */
+                               if (!TransactionIdIsCurrentTransactionId(xmax))
+                                       return true;
+                               else
+                                       return false;
+                       }
+
+                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                        {
                                /* deleting subtransaction must have aborted */
                                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -833,32 +955,47 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 
        if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                return false;                   /* updated by other */
        }
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+               TransactionId   xmax;
+
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+                       return true;
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+                       return true;
+               if (TransactionIdIsCurrentTransactionId(xmax))
+                       return false;
+               if (TransactionIdIsInProgress(xmax))
+               {
+                       snapshot->xmax = xmax;
+                       return true;
+               }
+               if (TransactionIdDidCommit(xmax))
+                       return false;
                return true;
        }
 
-       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
        {
-               if (tuple->t_infomask & HEAP_IS_LOCKED)
+               if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                        return true;
                return false;
        }
 
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
        {
-               snapshot->xmax = HeapTupleHeaderGetXmax(tuple);
+               snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple);
                return true;
        }
 
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
        {
                /* it must have aborted or crashed */
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -868,7 +1005,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
 
        /* xmax transaction committed */
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        {
                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                        InvalidTransactionId);
@@ -876,7 +1013,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
        }
 
        SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                               HeapTupleHeaderGetXmax(tuple));
+                               HeapTupleHeaderGetRawXmax(tuple));
        return false;                           /* updated by other */
 }
 
@@ -957,12 +1094,27 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return true;
 
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)         /* not deleter */
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))                 /* not deleter */
                                return true;
 
-                       Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
+                       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+                       {
+                               TransactionId   xmax;
+
+                               xmax = HeapTupleGetUpdateXid(tuple);
+                               if (!TransactionIdIsValid(xmax))
+                                       return true;
+
+                               /* updating subtransaction must have aborted */
+                               if (!TransactionIdIsCurrentTransactionId(xmax))
+                                       return true;
+                               else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
+                                       return true;    /* updated after scan started */
+                               else
+                                       return false;   /* updated before scan started */
+                       }
 
-                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+                       if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                        {
                                /* deleting subtransaction must have aborted */
                                SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -999,19 +1151,41 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid or aborted */
                return true;
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                return true;
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
+               TransactionId   xmax;
+
+               /* already checked above */
+               Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+                       return true;
+               if (TransactionIdIsCurrentTransactionId(xmax))
+               {
+                       if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
+                               return true;    /* deleted after scan started */
+                       else
+                               return false;   /* deleted before scan started */
+               }
+               if (TransactionIdIsInProgress(xmax))
+                       return true;
+               if (TransactionIdDidCommit(xmax))
+               {
+                       /* updating transaction committed, but when? */
+                       if (XidInMVCCSnapshot(xmax, snapshot))
+                               return true;    /* treat as still in progress */
+                       return false;
+               }
                return true;
        }
 
        if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
        {
-               if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
+               if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
                {
                        if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
                                return true;    /* deleted after scan started */
@@ -1019,10 +1193,10 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
                                return false;   /* deleted before scan started */
                }
 
-               if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+               if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                        return true;
 
-               if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+               if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
                {
                        /* it must have aborted or crashed */
                        SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
@@ -1032,13 +1206,13 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
 
                /* xmax transaction committed */
                SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                                       HeapTupleHeaderGetXmax(tuple));
+                                       HeapTupleHeaderGetRawXmax(tuple));
        }
 
        /*
         * OK, the deleting transaction committed too ... but when?
         */
-       if (XidInMVCCSnapshot(HeapTupleHeaderGetXmax(tuple), snapshot))
+       if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot))
                return true;                    /* treat as still in progress */
 
        return false;
@@ -1112,7 +1286,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
                {
                        if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                return HEAPTUPLE_INSERT_IN_PROGRESS;
-                       if (tuple->t_infomask & HEAP_IS_LOCKED)
+                       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
                                return HEAPTUPLE_INSERT_IN_PROGRESS;
                        /* inserted and then deleted by same xact */
                        return HEAPTUPLE_DELETE_IN_PROGRESS;
@@ -1144,7 +1318,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
        if (tuple->t_infomask & HEAP_XMAX_INVALID)
                return HEAPTUPLE_LIVE;
 
-       if (tuple->t_infomask & HEAP_IS_LOCKED)
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
        {
                /*
                 * "Deleting" xact really only locked it, so the tuple is live in any
@@ -1158,40 +1332,96 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
                {
                        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
                        {
-                               if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
+                               /*
+                                * If it's only locked but neither EXCL_LOCK nor KEYSHR_LOCK
+                                * is set, it cannot possibly be running; otherwise have to
+                                * check.
+                                */
+                               if ((tuple->t_infomask & (HEAP_XMAX_EXCL_LOCK |
+                                                                                 HEAP_XMAX_KEYSHR_LOCK)) &&
+                                       MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple)))
                                        return HEAPTUPLE_LIVE;
+                               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId);
+
                        }
                        else
                        {
-                               if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+                               if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                                        return HEAPTUPLE_LIVE;
+                               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+                                                       InvalidTransactionId);
                        }
-
-                       /*
-                        * We don't really care whether xmax did commit, abort or crash.
-                        * We know that xmax did lock the tuple, but it did not and will
-                        * never actually update it.
-                        */
-                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-                                               InvalidTransactionId);
                }
+
+               /*
+                * We don't really care whether xmax did commit, abort or crash.
+                * We know that xmax did lock the tuple, but it did not and will
+                * never actually update it.
+                */
+
                return HEAPTUPLE_LIVE;
        }
 
        if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
        {
-               /* MultiXacts are currently only allowed to lock tuples */
-               Assert(tuple->t_infomask & HEAP_IS_LOCKED);
-               return HEAPTUPLE_LIVE;
+               TransactionId xmax;
+
+               if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple)))
+               {
+                       /* already checked above */
+                       Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
+
+                       xmax = HeapTupleGetUpdateXid(tuple);
+                       if (!TransactionIdIsValid(xmax))
+                               return HEAPTUPLE_LIVE;
+                       if (TransactionIdIsInProgress(xmax))
+                               return HEAPTUPLE_DELETE_IN_PROGRESS;
+                       else if (TransactionIdDidCommit(xmax))
+                               /* there are still lockers around -- can't return DEAD here */
+                               return HEAPTUPLE_RECENTLY_DEAD;
+                       /* updating transaction aborted */
+                       return HEAPTUPLE_LIVE;
+               }
+
+               Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED));
+
+               xmax = HeapTupleGetUpdateXid(tuple);
+               if (!TransactionIdIsValid(xmax))
+                       return HEAPTUPLE_LIVE;
+               /* multi is not running -- updating xact cannot be */
+               Assert(!TransactionIdIsInProgress(xmax));
+               if (TransactionIdDidCommit(xmax))
+               {
+                       if (!TransactionIdPrecedes(xmax, OldestXmin))
+                               return HEAPTUPLE_RECENTLY_DEAD;
+                       else
+                               return HEAPTUPLE_DEAD;
+               }
+               else
+               {
+                       /*
+                        * Not in Progress, Not Committed, so either Aborted or crashed.
+                        */
+                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId);
+                       return HEAPTUPLE_LIVE;
+               }
+
+               /*
+                * Deleter committed, but perhaps it was recent enough that some open
+                * transactions could still see the tuple.
+                */
+
+               /* Otherwise, it's dead and removable */
+               return HEAPTUPLE_DEAD;
        }
 
        if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
        {
-               if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
+               if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
                        return HEAPTUPLE_DELETE_IN_PROGRESS;
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
+               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
                        SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                                               HeapTupleHeaderGetXmax(tuple));
+                                               HeapTupleHeaderGetRawXmax(tuple));
                else
                {
                        /*
@@ -1213,7 +1443,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
         * Deleter committed, but perhaps it was recent enough that some open
         * transactions could still see the tuple.
         */
-       if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin))
+       if (!TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin))
                return HEAPTUPLE_RECENTLY_DEAD;
 
        /* Otherwise, it's dead and removable */
@@ -1246,11 +1476,22 @@ HeapTupleIsSurelyDead(HeapTupleHeader tuple, TransactionId OldestXmin)
 
        /*
         * If the inserting transaction committed, but any deleting transaction
-        * aborted, the tuple is still alive.  Likewise, if XMAX is a lock rather
-        * than a delete, the tuple is still alive.
+        * aborted, the tuple is still alive.
         */
-       if (tuple->t_infomask &
-               (HEAP_XMAX_INVALID | HEAP_IS_LOCKED | HEAP_XMAX_IS_MULTI))
+       if (tuple->t_infomask & HEAP_XMAX_INVALID)
+               return false;
+
+       /*
+        * If the XMAX is just a lock, the tuple is still alive.
+        */
+       if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+               return false;
+
+       /*
+        * If the Xmax is a MultiXact, it might be dead or alive, but we cannot
+        * know without checking pg_multixact.
+        */
+       if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
                return false;
 
        /* If deleter isn't known to have committed, assume it's still running. */
@@ -1258,7 +1499,7 @@ HeapTupleIsSurelyDead(HeapTupleHeader tuple, TransactionId OldestXmin)
                return false;
 
        /* Deleter committed, so tuple is dead if the XID is old enough. */
-       return TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin);
+       return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin);
 }
 
 /*
@@ -1375,3 +1616,54 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 
        return false;
 }
+
+/*
+ * HeapTupleHeaderIsOnlyLocked -- is the tuple merely locked, not updated?
+ *
+ * Returns true if Xmax is invalid or flagged HEAP_XMAX_LOCK_ONLY, and false
+ * if Xmax is a plain (non-multi) Xid without that flag.  For a multi, the
+ * update Xid comes from HeapTupleGetUpdateXid; the answer is true if it is
+ * invalid, or is neither current, in progress nor committed.  This lives
+ * here because it follows the time qualification rules atop this file.
+ */
+bool
+HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
+{
+       TransactionId   xmax;
+
+       /* if there's no valid Xmax, then there's obviously no update either */
+       if (tuple->t_infomask & HEAP_XMAX_INVALID)
+               return true;
+
+       if (tuple->t_infomask & HEAP_XMAX_LOCK_ONLY)
+               return true;
+
+       /* invalid xmax means no update */
+       if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
+               return true;
+
+       /*
+        * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this
+        * must necessarily have been updated
+        */
+       if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+               return false;
+
+       /* ... but if it's a multi, then perhaps the updating Xid aborted. */
+       xmax = HeapTupleGetUpdateXid(tuple);
+       if (!TransactionIdIsValid(xmax))        /* shouldn't happen .. */
+               return true;
+
+       if (TransactionIdIsCurrentTransactionId(xmax))
+               return false;
+       if (TransactionIdIsInProgress(xmax))
+               return false;
+       if (TransactionIdDidCommit(xmax))
+               return false;
+
+       /*
+        * not current, not in progress, not committed -- must have aborted or
+        * crashed
+        */
+       return true;
+}
index 0fe68bb9e1357d9c9849d39977883da5626a8e37..6a82ea18ed5b45b0b08cad442f5c840a3fc76b77 100644 (file)
@@ -232,6 +232,10 @@ main(int argc, char *argv[])
                   ControlFile.checkPointCopy.oldestXidDB);
        printf(_("Latest checkpoint's oldestActiveXID:  %u\n"),
                   ControlFile.checkPointCopy.oldestActiveXid);
+       printf(_("Latest checkpoint's oldestMultiXact:  %u\n"),
+                  ControlFile.checkPointCopy.oldestMulti);
+       printf(_("Latest checkpoint's oldestMulti's DB: %u\n"),
+                  ControlFile.checkPointCopy.oldestMultiDB);
        printf(_("Time of latest checkpoint:            %s\n"),
                   ckpttime_str);
        printf(_("Min recovery ending location:         %X/%X\n"),
index 8734f2c8688082bc928e89cb9e16c6751555f64c..8e7fe7eb72e011091c345eff5c89da998346b4e5 100644 (file)
@@ -85,10 +85,12 @@ main(int argc, char *argv[])
        TransactionId set_xid = 0;
        Oid                     set_oid = 0;
        MultiXactId set_mxid = 0;
+       MultiXactId set_oldestmxid = 0;
        MultiXactOffset set_mxoff = (MultiXactOffset) -1;
        uint32          minXlogTli = 0;
        XLogSegNo       minXlogSegNo = 0;
        char       *endptr;
+       char       *endptr2;
        char       *DataDir;
        int                     fd;
 
@@ -170,7 +172,15 @@ main(int argc, char *argv[])
 
                        case 'm':
                                set_mxid = strtoul(optarg, &endptr, 0);
-                               if (endptr == optarg || *endptr != '\0')
+                               if (endptr == optarg || *endptr != ',')
+                               {
+                                       fprintf(stderr, _("%s: invalid argument for option -m\n"), progname);
+                                       fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+                                       exit(1);
+                               }
+
+                               set_oldestmxid = strtoul(endptr + 1, &endptr2, 0);
+                               if (endptr2 == endptr + 1 || *endptr2 != '\0')
                                {
                                        fprintf(stderr, _("%s: invalid argument for option -m\n"), progname);
                                        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
@@ -181,6 +191,16 @@ main(int argc, char *argv[])
                                        fprintf(stderr, _("%s: multitransaction ID (-m) must not be 0\n"), progname);
                                        exit(1);
                                }
+                               /*
+                                * XXX It'd be nice to have more sanity checks here, e.g. so
+                                * that oldest is not wrapped around w.r.t. nextMulti.
+                                */
+                               if (set_oldestmxid == 0)
+                               {
+                                       fprintf(stderr, _("%s: oldest multitransaction ID (-m) must not be 0\n"),
+                                                       progname);
+                                       exit(1);
+                               }
                                break;
 
                        case 'O':
@@ -307,8 +327,15 @@ main(int argc, char *argv[])
                ControlFile.checkPointCopy.nextOid = set_oid;
 
        if (set_mxid != 0)
+       {
                ControlFile.checkPointCopy.nextMulti = set_mxid;
 
+               ControlFile.checkPointCopy.oldestMulti = set_oldestmxid;
+               if (ControlFile.checkPointCopy.oldestMulti < FirstMultiXactId)
+                       ControlFile.checkPointCopy.oldestMulti += FirstMultiXactId;
+               ControlFile.checkPointCopy.oldestMultiDB = InvalidOid;
+       }
+
        if (set_mxoff != -1)
                ControlFile.checkPointCopy.nextMultiOffset = set_mxoff;
 
@@ -471,6 +498,8 @@ GuessControlValues(void)
        ControlFile.checkPointCopy.nextMultiOffset = 0;
        ControlFile.checkPointCopy.oldestXid = FirstNormalTransactionId;
        ControlFile.checkPointCopy.oldestXidDB = InvalidOid;
+       ControlFile.checkPointCopy.oldestMulti = FirstMultiXactId;
+       ControlFile.checkPointCopy.oldestMultiDB = InvalidOid;
        ControlFile.checkPointCopy.time = (pg_time_t) time(NULL);
        ControlFile.checkPointCopy.oldestActiveXid = InvalidTransactionId;
 
@@ -562,6 +591,10 @@ PrintControlValues(bool guessed)
                   ControlFile.checkPointCopy.oldestXidDB);
        printf(_("Latest checkpoint's oldestActiveXID:  %u\n"),
                   ControlFile.checkPointCopy.oldestActiveXid);
+       printf(_("Latest checkpoint's oldestMultiXid:   %u\n"),
+                  ControlFile.checkPointCopy.oldestMulti);
+       printf(_("Latest checkpoint's oldestMulti's DB: %u\n"),
+                  ControlFile.checkPointCopy.oldestMultiDB);
        printf(_("Maximum data alignment:               %u\n"),
                   ControlFile.maxAlign);
        /* we don't print floatFormat since can't say much useful about it */
@@ -994,7 +1027,7 @@ usage(void)
        printf(_("  -e XIDEPOCH      set next transaction ID epoch\n"));
        printf(_("  -f               force update to be done\n"));
        printf(_("  -l xlogfile      force minimum WAL starting location for new transaction log\n"));
-       printf(_("  -m XID           set next multitransaction ID\n"));
+       printf(_("  -m XID,OLDEST    set next multitransaction ID and oldest value\n"));
        printf(_("  -n               no update, just show extracted control values (for testing)\n"));
        printf(_("  -o OID           set next OID\n"));
        printf(_("  -O OFFSET        set next multitransaction offset\n"));
index c737b3ff2898fadee4bca484a9a80d4a9bda0089..af9e506d2b8681cfa95deafb956120d0043414b7 100644 (file)
 
 typedef struct BulkInsertStateData *BulkInsertState;
 
-typedef enum
+/*
+ * Possible lock modes for a tuple.
+ */
+typedef enum LockTupleMode
 {
-       LockTupleShared,
+       /* SELECT FOR KEY SHARE */
+       LockTupleKeyShare,
+       /* SELECT FOR SHARE */
+       LockTupleShare,
+       /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */
+       LockTupleNoKeyExclusive,
+       /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */
        LockTupleExclusive
 } LockTupleMode;
 
+#define MaxLockTupleMode       LockTupleExclusive
+
 /*
  * When heap_update, heap_delete, or heap_lock_tuple fail because the target
  * tuple is already outdated, they fill in this struct to provide information
@@ -129,14 +140,16 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid,
 extern HTSU_Result heap_update(Relation relation, ItemPointer otid,
                        HeapTuple newtup,
                        CommandId cid, Snapshot crosscheck, bool wait,
-                       HeapUpdateFailureData *hufd);
+                       HeapUpdateFailureData *hufd, LockTupleMode *lockmode);
 extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
                                CommandId cid, LockTupleMode mode, bool nowait,
+                               bool follow_update,
                                Buffer *buffer, HeapUpdateFailureData *hufd);
 extern void heap_inplace_update(Relation relation, HeapTuple tuple);
-extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid);
+extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+                                 TransactionId cutoff_multi);
 extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
-                                               Buffer buf);
+                                               MultiXactId cutoff_multi, Buffer buf);
 
 extern Oid     simple_heap_insert(Relation relation, HeapTuple tup);
 extern void simple_heap_delete(Relation relation, ItemPointer tid);
index 9db69537203ba639c4095469de2d3128c934b713..270924a01ae87d380c9d99cc4b5c590863ff5ba8 100644 (file)
@@ -54,6 +54,7 @@
 #define XLOG_HEAP2_CLEANUP_INFO 0x30
 #define XLOG_HEAP2_VISIBLE             0x40
 #define XLOG_HEAP2_MULTI_INSERT 0x50
+#define XLOG_HEAP2_LOCK_UPDATED 0x60
 
 /*
  * All what we need to find changed tuple
@@ -75,6 +76,8 @@ typedef struct xl_heaptid
 typedef struct xl_heap_delete
 {
        xl_heaptid      target;                 /* deleted tuple id */
+       TransactionId xmax;                     /* xmax of the deleted tuple */
+       uint8           infobits_set;   /* infomask bits */
        bool            all_visible_cleared;    /* PD_ALL_VISIBLE was cleared */
 } xl_heap_delete;
 
@@ -141,7 +144,10 @@ typedef struct xl_multi_insert_tuple
 typedef struct xl_heap_update
 {
        xl_heaptid      target;                 /* deleted tuple id */
+       TransactionId old_xmax;         /* xmax of the old tuple */
+       TransactionId new_xmax;         /* xmax of the new tuple */
        ItemPointerData newtid;         /* new inserted tuple id */
+       uint8           old_infobits_set;       /* infomask bits to set on old tuple */
        bool            all_visible_cleared;    /* PD_ALL_VISIBLE was cleared */
        bool            new_all_visible_cleared;                /* same for the page of newtid */
        /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */
@@ -197,16 +203,32 @@ typedef struct xl_heap_newpage
 
 #define SizeOfHeapNewpage      (offsetof(xl_heap_newpage, blkno) + sizeof(BlockNumber))
 
+/* flags for infobits_set */
+#define XLHL_XMAX_IS_MULTI             0x01
+#define XLHL_XMAX_LOCK_ONLY            0x02
+#define XLHL_XMAX_EXCL_LOCK            0x04
+#define XLHL_XMAX_KEYSHR_LOCK  0x08
+#define XLHL_KEYS_UPDATED              0x10
+
 /* This is what we need to know about lock */
 typedef struct xl_heap_lock
 {
        xl_heaptid      target;                 /* locked tuple id */
        TransactionId locking_xid;      /* might be a MultiXactId not xid */
-       bool            xid_is_mxact;   /* is it? */
-       bool            shared_lock;    /* shared or exclusive row lock? */
+       int8            infobits_set;   /* infomask and infomask2 bits to set */
 } xl_heap_lock;
 
-#define SizeOfHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool))
+#define SizeOfHeapLock (offsetof(xl_heap_lock, infobits_set) + sizeof(int8))
+
+/* This is what we need to know about locking an updated version of a row */
+typedef struct xl_heap_lock_updated
+{
+       xl_heaptid      target;
+       TransactionId   xmax;
+       uint8           infobits_set;
+} xl_heap_lock_updated;
+
+#define SizeOfHeapLockUpdated  (offsetof(xl_heap_lock_updated, infobits_set) + sizeof(uint8))
 
 /* This is what we need to know about in-place update */
 typedef struct xl_heap_inplace
@@ -223,10 +245,11 @@ typedef struct xl_heap_freeze
        RelFileNode node;
        BlockNumber block;
        TransactionId cutoff_xid;
+       MultiXactId cutoff_multi;
        /* TUPLE OFFSET NUMBERS FOLLOW AT THE END */
 } xl_heap_freeze;
 
-#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))
 
 /* This is what we need to know about setting a visibility map bit */
 typedef struct xl_heap_visible
@@ -254,7 +277,7 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
                           OffsetNumber *nowunused, int nunused,
                           TransactionId latestRemovedXid);
 extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
-                               TransactionId cutoff_xid,
+                               TransactionId cutoff_xid, MultiXactId cutoff_multi,
                                OffsetNumber *offsets, int offcnt);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block,
                                 Buffer vm_buffer, TransactionId cutoff_xid);
index 9cd4b88ed45e2ebee9600ee94b75a329d17f9ed0..79e3c50ef18e75ea61ef49c1c6cc512ca9087e72 100644 (file)
@@ -80,7 +80,9 @@ typedef HeapTupleData *HeapTuple;
 extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
 extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
 extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup,
-                                                 CommandId *cmax,
-                                                 bool *iscombo);
+                                                 CommandId *cmax, bool *iscombo);
+
+/* Prototype for HeapTupleHeader accessors in heapam.c */
+extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple);
 
 #endif   /* HTUP_H */
index aeab45bb97731a7bef2dcbcf3f5a7fa52f4a2394..6a28d8ed74e0cbe79b119afa9164cf813ef3e9fc 100644 (file)
@@ -162,12 +162,16 @@ struct HeapTupleHeaderData
 #define HEAP_HASVARWIDTH               0x0002  /* has variable-width attribute(s) */
 #define HEAP_HASEXTERNAL               0x0004  /* has external stored attribute(s) */
 #define HEAP_HASOID                            0x0008  /* has an object-id field */
-/* bit 0x0010 is available */
+#define HEAP_XMAX_KEYSHR_LOCK  0x0010  /* xmax is a key-shared locker */
 #define HEAP_COMBOCID                  0x0020  /* t_cid is a combo cid */
 #define HEAP_XMAX_EXCL_LOCK            0x0040  /* xmax is exclusive locker */
-#define HEAP_XMAX_SHARED_LOCK  0x0080  /* xmax is shared locker */
-/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */
-#define HEAP_IS_LOCKED (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK)
+#define HEAP_XMAX_LOCK_ONLY            0x0080  /* xmax, if valid, is only a locker */
+
+                                                                               /* xmax is a shared locker */
+#define HEAP_XMAX_SHR_LOCK     (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)
+
+#define HEAP_LOCK_MASK (HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | \
+                                                HEAP_XMAX_KEYSHR_LOCK)
 #define HEAP_XMIN_COMMITTED            0x0100  /* t_xmin committed */
 #define HEAP_XMIN_INVALID              0x0200  /* t_xmin invalid/aborted */
 #define HEAP_XMAX_COMMITTED            0x0400  /* t_xmax committed */
@@ -182,17 +186,42 @@ struct HeapTupleHeaderData
                                                                                 * upgrade support */
 #define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)
 
-#define HEAP_XACT_MASK                 0xFFE0  /* visibility-related bits */
+#define HEAP_XACT_MASK                 0xFFF0  /* visibility-related bits */
+
+/*
+ * A tuple is only locked (i.e. not updated by its Xmax) if the
+ * HEAP_XMAX_LOCK_ONLY bit is set.
+ *
+ * See also HeapTupleHeaderIsOnlyLocked, which also checks for a possible
+ * aborted updater transaction.
+ */
+#define HEAP_XMAX_IS_LOCKED_ONLY(infomask) \
+       ((infomask) & HEAP_XMAX_LOCK_ONLY)
+/*
+ * Use these to test whether a particular lock is applied to a tuple
+ */
+#define HEAP_XMAX_IS_SHR_LOCKED(infomask) \
+    (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK)
+#define HEAP_XMAX_IS_EXCL_LOCKED(infomask) \
+    (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK)
+#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) \
+    (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK)
+
+/* turn these all off when Xmax is to change */
+#define HEAP_XMAX_BITS (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | \
+                                               HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_LOCK_ONLY)
 
 /*
  * information stored in t_infomask2:
  */
 #define HEAP_NATTS_MASK                        0x07FF  /* 11 bits for number of attributes */
-/* bits 0x3800 are available */
+/* bits 0x1800 are available */
+#define HEAP_KEYS_UPDATED              0x2000  /* tuple was updated and key cols
+                                                                                * modified, or tuple deleted */
 #define HEAP_HOT_UPDATED               0x4000  /* tuple was HOT-updated */
 #define HEAP_ONLY_TUPLE                        0x8000  /* this is heap-only tuple */
 
-#define HEAP2_XACT_MASK                        0xC000  /* visibility-related bits */
+#define HEAP2_XACT_MASK                        0xE000  /* visibility-related bits */
 
 /*
  * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins.  It is
@@ -219,7 +248,24 @@ struct HeapTupleHeaderData
        (tup)->t_choice.t_heap.t_xmin = (xid) \
 )
 
-#define HeapTupleHeaderGetXmax(tup) \
+/*
+ * HeapTupleHeaderGetRawXmax gets you the raw Xmax field.  To find out the Xid
+ * that updated a tuple, you might need to resolve the MultiXactId if certain
+ * bits are set.  HeapTupleHeaderGetUpdateXid checks those bits and takes care
+ * to resolve the MultiXactId if necessary.  This might involve multixact I/O,
+ * so it should only be used if absolutely necessary.
+ */
+#define HeapTupleHeaderGetUpdateXid(tup) \
+( \
+       (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \
+        ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \
+        !((tup)->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? \
+               HeapTupleGetUpdateXid(tup) \
+       : \
+               HeapTupleHeaderGetRawXmax(tup) \
+)
+
+#define HeapTupleHeaderGetRawXmax(tup) \
 ( \
        (tup)->t_choice.t_heap.t_xmax \
 )
index b5486bec09790dce6963bd6c9e2331276367859c..b08bb1f49a687d3ad27126979572591dc319555a 100644 (file)
 
 #include "access/xlog.h"
 
+
+/*
+ * MultiXactId 0 is reserved as InvalidMultiXactId and never names a real
+ * multixact, so we start assigning multixact values from FirstMultiXactId,
+ * which is 1.
+ */
 #define InvalidMultiXactId     ((MultiXactId) 0)
 #define FirstMultiXactId       ((MultiXactId) 1)
+#define MaxMultiXactId         ((MultiXactId) 0xFFFFFFFF)
 
 #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId)
 
 #define NUM_MXACTOFFSET_BUFFERS                8
 #define NUM_MXACTMEMBER_BUFFERS                16
 
+/*
+ * Possible multixact lock modes ("status").  The first four modes are for
+ * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
+ * next two are used for update and delete modes.
+ */
+typedef enum
+{
+       MultiXactStatusForKeyShare = 0x00,
+       MultiXactStatusForShare = 0x01,
+       MultiXactStatusForNoKeyUpdate = 0x02,
+       MultiXactStatusForUpdate = 0x03,
+       /* an update that doesn't touch "key" columns */
+       MultiXactStatusNoKeyUpdate = 0x04,
+       /* other updates, and delete */
+       MultiXactStatusUpdate = 0x05
+} MultiXactStatus;
+
+#define MaxMultiXactStatus MultiXactStatusUpdate
+
+
+typedef struct MultiXactMember
+{
+       TransactionId   xid;
+       MultiXactStatus status;
+} MultiXactMember;
+
+
 /* ----------------
  *             multixact-related XLOG entries
  * ----------------
@@ -35,21 +69,24 @@ typedef struct xl_multixact_create
 {
        MultiXactId mid;                        /* new MultiXact's ID */
        MultiXactOffset moff;           /* its starting offset in members file */
-       int32           nxids;                  /* number of member XIDs */
-       TransactionId xids[1];          /* VARIABLE LENGTH ARRAY */
+       int32           nmembers;               /* number of member XIDs */
+       MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 } xl_multixact_create;
 
-#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids)
+#define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members))
 
 
-extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2);
-extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid);
+extern MultiXactId MultiXactIdCreate(TransactionId xid1,
+                                 MultiXactStatus status1, TransactionId xid2,
+                                 MultiXactStatus status2);
+extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
+                                 MultiXactStatus status);
+extern MultiXactId ReadNextMultiXactId(void);
 extern bool MultiXactIdIsRunning(MultiXactId multi);
-extern bool MultiXactIdIsCurrent(MultiXactId multi);
-extern void MultiXactIdWait(MultiXactId multi);
-extern bool ConditionalMultiXactIdWait(MultiXactId multi);
 extern void MultiXactIdSetOldestMember(void);
-extern int     GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids);
+extern int     GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **xids,
+                                         bool allow_old);
+extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
@@ -60,14 +97,21 @@ extern void MultiXactShmemInit(void);
 extern void BootStrapMultiXact(void);
 extern void StartupMultiXact(void);
 extern void ShutdownMultiXact(void);
+extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid,
+                                       Oid oldest_datoid);
 extern void MultiXactGetCheckptMulti(bool is_shutdown,
                                                 MultiXactId *nextMulti,
-                                                MultiXactOffset *nextMultiOffset);
+                                                MultiXactOffset *nextMultiOffset,
+                                                MultiXactId *oldestMulti,
+                                                Oid *oldestMultiDB);
 extern void CheckPointMultiXact(void);
+extern MultiXactId GetOldestMultiXactId(void);
+extern void TruncateMultiXact(MultiXactId cutoff_multi);
 extern void MultiXactSetNextMXact(MultiXactId nextMulti,
                                          MultiXactOffset nextMultiOffset);
 extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
                                                  MultiXactOffset minMultiOffset);
+extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);
 
 extern void multixact_twophase_recover(TransactionId xid, uint16 info,
                                                   void *recdata, uint32 len);
@@ -78,5 +122,7 @@ extern void multixact_twophase_postabort(TransactionId xid, uint16 info,
 
 extern void multixact_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void multixact_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern char *mxid_to_string(MultiXactId multi, int nmembers,
+                          MultiXactMember *members);
 
 #endif   /* MULTIXACT_H */
index 13b991a8b1a6d78dd522fad4dc9ecc70989342ef..f82d1f5734737feae6cd68b60d38f54883d6a41d 100644 (file)
@@ -21,7 +21,7 @@ typedef struct RewriteStateData *RewriteState;
 
 extern RewriteState begin_heap_rewrite(Relation NewHeap,
                                   TransactionId OldestXmin, TransactionId FreezeXid,
-                                  bool use_wal);
+                                  MultiXactId MultiXactFrzLimit, bool use_wal);
 extern void end_heap_rewrite(RewriteState state);
 extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
                                   HeapTuple newTuple);
index a676793566d54e4eefaba16098613928831b946b..4b8fa0175b3866459b5e1317d27a8270e4765e61 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201301211
+#define CATALOG_VERSION_NO     201301231
 
 #endif
index fcc293899ab4ec85ddcb5ff1deaca6464d18c6fb..820552f013412142b70dbecf0fe44ce2bcdc5456 100644 (file)
@@ -67,6 +67,8 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
        bool            relhastriggers; /* has (or has had) any TRIGGERs */
        bool            relhassubclass; /* has (or has had) derived classes */
        TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */
+       TransactionId relminmxid;       /* all multixacts in this rel are >= this.
+                                                                * this is really a MultiXactId */
 
 #ifdef CATALOG_VARLEN                  /* variable-length fields start here */
        /* NOTE: These fields are not present in a relcache entry's rd_rel field. */
@@ -77,7 +79,7 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
 
 /* Size of fixed part of pg_class tuples, not counting var-length fields */
 #define CLASS_TUPLE_SIZE \
-        (offsetof(FormData_pg_class,relfrozenxid) + sizeof(TransactionId))
+        (offsetof(FormData_pg_class,relminmxid) + sizeof(TransactionId))
 
 /* ----------------
  *             Form_pg_class corresponds to a pointer to a tuple with
@@ -91,7 +93,7 @@ typedef FormData_pg_class *Form_pg_class;
  * ----------------
  */
 
-#define Natts_pg_class                                 27
+#define Natts_pg_class                                 28
 #define Anum_pg_class_relname                  1
 #define Anum_pg_class_relnamespace             2
 #define Anum_pg_class_reltype                  3
@@ -117,8 +119,9 @@ typedef FormData_pg_class *Form_pg_class;
 #define Anum_pg_class_relhastriggers   23
 #define Anum_pg_class_relhassubclass   24
 #define Anum_pg_class_relfrozenxid             25
-#define Anum_pg_class_relacl                   26
-#define Anum_pg_class_reloptions               27
+#define Anum_pg_class_relminmxid               26
+#define Anum_pg_class_relacl                   27
+#define Anum_pg_class_reloptions               28
 
 /* ----------------
  *             initial contents of pg_class
@@ -129,14 +132,17 @@ typedef FormData_pg_class *Form_pg_class;
  * ----------------
  */
 
-/* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
-DATA(insert OID = 1247 (  pg_type              PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 _null_ _null_ ));
+/*
+ * Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId;
+ * similarly, "1" in relminmxid stands for FirstMultiXactId
+ */
+DATA(insert OID = 1247 (  pg_type              PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 1 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1249 (  pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1249 (  pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1255 (  pg_proc              PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1255 (  pg_proc              PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1259 (  pg_class             PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1259 (  pg_class             PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 28 0 t f f f f 3 1 _null_ _null_ ));
 DESCR("");
 
 
index ead3a6e4bad7e78a742ed71e3d6820bf3a14540d..e4a9abe7bc55d21b83a08143c0b5caf3e828b7f5 100644 (file)
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION     932
+#define PG_CONTROL_VERSION     933
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -41,6 +41,8 @@ typedef struct CheckPoint
        MultiXactOffset nextMultiOffset;        /* next free MultiXact offset */
        TransactionId oldestXid;        /* cluster-wide minimum datfrozenxid */
        Oid                     oldestXidDB;    /* database with minimum datfrozenxid */
+       MultiXactId     oldestMulti;    /* cluster-wide minimum datminmxid */
+       Oid                     oldestMultiDB;  /* database with minimum datminmxid */
        pg_time_t       time;                   /* time stamp of checkpoint */
 
        /*
index 4010959b029d6f4824db948a2ca31b2d52ad73f5..baeddcd12a103a6b0b4852c139b52ec44e79098f 100644 (file)
@@ -41,6 +41,7 @@ CATALOG(pg_database,1262) BKI_SHARED_RELATION BKI_ROWTYPE_OID(1248) BKI_SCHEMA_M
        int32           datconnlimit;   /* max connections allowed (-1=no limit) */
        Oid                     datlastsysoid;  /* highest OID to consider a system OID */
        TransactionId datfrozenxid; /* all Xids < this are frozen in this DB */
+       TransactionId datminmxid;       /* all multixacts in the DB are >= this */
        Oid                     dattablespace;  /* default table space for this DB */
 
 #ifdef CATALOG_VARLEN                  /* variable-length fields start here */
@@ -59,7 +60,7 @@ typedef FormData_pg_database *Form_pg_database;
  *             compiler constants for pg_database
  * ----------------
  */
-#define Natts_pg_database                              12
+#define Natts_pg_database                              13
 #define Anum_pg_database_datname               1
 #define Anum_pg_database_datdba                        2
 #define Anum_pg_database_encoding              3
@@ -70,10 +71,11 @@ typedef FormData_pg_database *Form_pg_database;
 #define Anum_pg_database_datconnlimit  8
 #define Anum_pg_database_datlastsysoid 9
 #define Anum_pg_database_datfrozenxid  10
-#define Anum_pg_database_dattablespace 11
-#define Anum_pg_database_datacl                        12
+#define Anum_pg_database_datminmxid            11
+#define Anum_pg_database_dattablespace 12
+#define Anum_pg_database_datacl                        13
 
-DATA(insert OID = 1 (  template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1663 _null_));
+DATA(insert OID = 1 (  template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1 1663 _null_));
 SHDESCR("default template for new databases");
 #define TemplateDbOid                  1
 
index 010605d774c7a022422a4e5cdab8ef40fc075f24..028e1684ff0c3c7461b893d9d3723a6a8eca7607 100644 (file)
@@ -2909,6 +2909,8 @@ DATA(insert OID = 1371 (  pg_lock_status   PGNSP PGUID 12 1 1000 0 0 f f f f t t
 DESCR("view system lock information");
 DATA(insert OID = 1065 (  pg_prepared_xact PGNSP PGUID 12 1 1000 0 0 f f f f t t v 0 0 2249 "" "{28,25,1184,26,26}" "{o,o,o,o,o}" "{transaction,gid,prepared,ownerid,dbid}" _null_ pg_prepared_xact _null_ _null_ _null_ ));
 DESCR("view two-phase transactions");
+DATA(insert OID = 3819 (  pg_get_multixact_members PGNSP PGUID 12 1 1000 0 0 f f f f t t v 1 0 2249 "28" "{28,28,25}" "{i,o,o}" "{multixid,xid,mode}" _null_ pg_get_multixact_members _null_ _null_ _null_ ));
+DESCR("view members of a multixactid");
 
 DATA(insert OID = 3537 (  pg_describe_object           PGNSP PGUID 12 1 0 0 0 f f f f t f s 3 0 25 "26 26 23" _null_ _null_ _null_ _null_ pg_describe_object _null_ _null_ _null_ ));
 DESCR("get identification of SQL object");
index 532c31c11b43f01b29118b9cca5b65780f2c5eb7..73c701fe53eed0b47346eacf82d1eefd05b868e8 100644 (file)
@@ -30,6 +30,7 @@ extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
                                 bool is_system_catalog,
                                 bool swap_toast_by_content,
                                 bool check_constraints,
-                                TransactionId frozenXid);
+                                TransactionId frozenXid,
+                                MultiXactId frozenMulti);
 
 #endif   /* CLUSTER_H */
index f70442af4a21f6f5c5362f8a772c9025aa9f8600..d8dd8b04ed98caa670531e08efef09afd5bbd371 100644 (file)
@@ -153,12 +153,14 @@ extern void vac_update_relstats(Relation relation,
                                        double num_tuples,
                                        BlockNumber num_all_visible_pages,
                                        bool hasindex,
-                                       TransactionId frozenxid);
+                                       TransactionId frozenxid,
+                                       MultiXactId minmulti);
 extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
                                          bool sharedRel,
                                          TransactionId *oldestXmin,
                                          TransactionId *freezeLimit,
-                                         TransactionId *freezeTableLimit);
+                                         TransactionId *freezeTableLimit,
+                                         MultiXactId *multiXactFrzLimit);
 extern void vac_update_datfrozenxid(void);
 extern void vacuum_delay_point(void);
 
index 50468938666c701c991e191ac92551b01414bddb..b1213a0635013876e546725536d234e6cd478d3f 100644 (file)
@@ -193,7 +193,7 @@ extern void ExecConstraints(ResultRelInfo *resultRelInfo,
 extern ExecRowMark *ExecFindRowMark(EState *estate, Index rti);
 extern ExecAuxRowMark *ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist);
 extern TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate,
-                        Relation relation, Index rti,
+                        Relation relation, Index rti, int lockmode,
                         ItemPointer tid, TransactionId priorXmax);
 extern HeapTuple EvalPlanQualFetch(EState *estate, Relation relation,
                                  int lockmode, ItemPointer tid, TransactionId priorXmax);
index b23989e19ff899712d1f5ddd3469524a3b6e2e80..76e8cdb1ad8d6ccb540369aeecae78c956c3b85b 100644 (file)
@@ -403,9 +403,9 @@ typedef struct EState
 
 /*
  * ExecRowMark -
- *        runtime representation of FOR UPDATE/SHARE clauses
+ *        runtime representation of FOR [KEY] UPDATE/SHARE clauses
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we should have an
+ * When doing UPDATE, DELETE, or SELECT FOR [KEY] UPDATE/SHARE, we should have an
  * ExecRowMark for each non-target relation in the query (except inheritance
  * parent RTEs, which can be ignored at runtime).  See PlanRowMark for details
  * about most of the fields.  In addition to fields directly derived from
@@ -426,7 +426,7 @@ typedef struct ExecRowMark
 
 /*
  * ExecAuxRowMark -
- *        additional runtime representation of FOR UPDATE/SHARE clauses
+ *        additional runtime representation of FOR [KEY] UPDATE/SHARE clauses
  *
  * Each LockRows and ModifyTable node keeps a list of the rowmarks it needs to
  * deal with.  In addition to a pointer to the related entry in es_rowMarks,
@@ -1824,7 +1824,7 @@ typedef struct SetOpState
 /* ----------------
  *      LockRowsState information
  *
- *             LockRows nodes are used to enforce FOR UPDATE/FOR SHARE locking.
+ *             LockRows nodes are used to enforce FOR [KEY] UPDATE/SHARE locking.
  * ----------------
  */
 typedef struct LockRowsState
index 56cf592e0cef3ad19ac10dcd0218f9668ce16106..d8678e5b3fd7170cb972b64958b004fd29246d8f 100644 (file)
@@ -74,7 +74,7 @@ typedef uint32 AclMode;                       /* a bitmask of privilege bits */
 #define ACL_CONNECT            (1<<11) /* for databases */
 #define N_ACL_RIGHTS   12              /* 1 plus the last 1<<x */
 #define ACL_NO_RIGHTS  0
-/* Currently, SELECT ... FOR UPDATE/FOR SHARE requires UPDATE privileges */
+/* Currently, SELECT ... FOR [KEY] UPDATE/SHARE requires UPDATE privileges */
 #define ACL_SELECT_FOR_UPDATE  ACL_UPDATE
 
 
@@ -119,7 +119,7 @@ typedef struct Query
        bool            hasDistinctOn;  /* distinctClause is from DISTINCT ON */
        bool            hasRecursive;   /* WITH RECURSIVE was specified */
        bool            hasModifyingCTE;        /* has INSERT/UPDATE/DELETE in WITH */
-       bool            hasForUpdate;   /* FOR UPDATE or FOR SHARE was specified */
+       bool            hasForUpdate;   /* FOR [KEY] UPDATE/SHARE was specified */
 
        List       *cteList;            /* WITH list (of CommonTableExpr's) */
 
@@ -572,18 +572,28 @@ typedef struct DefElem
 } DefElem;
 
 /*
- * LockingClause - raw representation of FOR UPDATE/SHARE options
+ * LockingClause - raw representation of FOR [NO KEY] UPDATE/[KEY] SHARE
+ *             options
  *
  * Note: lockedRels == NIL means "all relations in query".     Otherwise it
  * is a list of RangeVar nodes.  (We use RangeVar mainly because it carries
  * a location field --- currently, parse analysis insists on unqualified
  * names in LockingClause.)
  */
+typedef enum LockClauseStrength
+{
+       /* order is important -- see applyLockingClause */
+       LCS_FORKEYSHARE,
+       LCS_FORSHARE,
+       LCS_FORNOKEYUPDATE,
+       LCS_FORUPDATE
+} LockClauseStrength;
+
 typedef struct LockingClause
 {
        NodeTag         type;
-       List       *lockedRels;         /* FOR UPDATE or FOR SHARE relations */
-       bool            forUpdate;              /* true = FOR UPDATE, false = FOR SHARE */
+       List       *lockedRels;         /* FOR [KEY] UPDATE/SHARE relations */
+       LockClauseStrength strength;
        bool            noWait;                 /* NOWAIT option */
 } LockingClause;
 
@@ -865,21 +875,21 @@ typedef struct WindowClause
 
 /*
  * RowMarkClause -
- *        parser output representation of FOR UPDATE/SHARE clauses
+ *        parser output representation of FOR [KEY] UPDATE/SHARE clauses
  *
  * Query.rowMarks contains a separate RowMarkClause node for each relation
- * identified as a FOR UPDATE/SHARE target.  If FOR UPDATE/SHARE is applied
- * to a subquery, we generate RowMarkClauses for all normal and subquery rels
- * in the subquery, but they are marked pushedDown = true to distinguish them
- * from clauses that were explicitly written at this query level.  Also,
- * Query.hasForUpdate tells whether there were explicit FOR UPDATE/SHARE
- * clauses in the current query level.
+ * identified as a FOR [KEY] UPDATE/SHARE target.  If one of these clauses
+ * is applied to a subquery, we generate RowMarkClauses for all normal and
+ * subquery rels in the subquery, but they are marked pushedDown = true to
+ * distinguish them from clauses that were explicitly written at this query
+ * level.  Also, Query.hasForUpdate tells whether there were explicit FOR
+ * [KEY] UPDATE/SHARE clauses in the current query level.
  */
 typedef struct RowMarkClause
 {
        NodeTag         type;
        Index           rti;                    /* range table index of target relation */
-       bool            forUpdate;              /* true = FOR UPDATE, false = FOR SHARE */
+       LockClauseStrength strength;
        bool            noWait;                 /* NOWAIT option */
        bool            pushedDown;             /* pushed down from higher query level? */
 } RowMarkClause;
index 41c5e920347390eabde8914e5f97491a4b40305c..0b8b1076bbffbc49ca214bc4d14f59ac90fd3701 100644 (file)
@@ -752,7 +752,7 @@ typedef struct Limit
  * RowMarkType -
  *       enums for types of row-marking operations
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we have to uniquely
+ * When doing UPDATE, DELETE, or SELECT FOR [KEY] UPDATE/SHARE, we have to uniquely
  * identify all the source rows, not only those from the target relations, so
  * that we can perform EvalPlanQual rechecking at need.  For plain tables we
  * can just fetch the TID, the same as for a target relation.  Otherwise (for
@@ -763,20 +763,22 @@ typedef struct Limit
 typedef enum RowMarkType
 {
        ROW_MARK_EXCLUSIVE,                     /* obtain exclusive tuple lock */
+       ROW_MARK_NOKEYEXCLUSIVE,        /* obtain no-key exclusive tuple lock */
        ROW_MARK_SHARE,                         /* obtain shared tuple lock */
+       ROW_MARK_KEYSHARE,                      /* obtain keyshare tuple lock */
        ROW_MARK_REFERENCE,                     /* just fetch the TID */
        ROW_MARK_COPY                           /* physically copy the row value */
 } RowMarkType;
 
-#define RowMarkRequiresRowShareLock(marktype)  ((marktype) <= ROW_MARK_SHARE)
+#define RowMarkRequiresRowShareLock(marktype)  ((marktype) <= ROW_MARK_KEYSHARE)
 
 /*
  * PlanRowMark -
- *        plan-time representation of FOR UPDATE/SHARE clauses
+ *        plan-time representation of FOR [KEY] UPDATE/SHARE clauses
  *
- * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we create a separate
+ * When doing UPDATE, DELETE, or SELECT FOR [KEY] UPDATE/SHARE, we create a separate
  * PlanRowMark node for each non-target relation in the query. Relations that
- * are not specified as FOR UPDATE/SHARE are marked ROW_MARK_REFERENCE (if
+ * are not specified as FOR [KEY] UPDATE/SHARE are marked ROW_MARK_REFERENCE (if
  * real tables) or ROW_MARK_COPY (if not).
  *
  * Initially all PlanRowMarks have rti == prti and isParent == false.
index fc45153f36adc7d74ef7f53ed49304b676235674..2f988d402190762145ac1e059c88a447ce593677 100644 (file)
@@ -38,6 +38,6 @@ extern bool analyze_requires_snapshot(Node *parseTree);
 
 extern void CheckSelectLocking(Query *qry);
 extern void applyLockingClause(Query *qry, Index rtindex,
-                                  bool forUpdate, bool noWait, bool pushedDown);
+                                  LockClauseStrength strength, bool noWait, bool pushedDown);
 
 #endif   /* ANALYZE_H */
index b6e922f3582aae68f5ba63a3a59217c2a4e8e8cb..8ff107a7b3f1741b7dcc14279ae9964f5abd6db0 100644 (file)
@@ -456,6 +456,13 @@ typedef Datum *DatumPtr;
 
 #define TransactionIdGetDatum(X) ((Datum) SET_4_BYTES((X)))
 
+/*
+ * MultiXactIdGetDatum
+ *             Returns datum representation for a multixact identifier.
+ */
+
+#define MultiXactIdGetDatum(X) ((Datum) SET_4_BYTES((X)))
+
 /*
  * DatumGetCommandId
  *             Returns command identifier value of a datum.
index c8974c9ac563fabc7878a2a1e857de0f007e6440..f10c8f194fa0bff6822969a96651a31a43c21e0c 100644 (file)
@@ -478,6 +478,7 @@ typedef enum
 extern void InitLocks(void);
 extern LockMethod GetLocksMethodTable(const LOCK *lock);
 extern uint32 LockTagHashCode(const LOCKTAG *locktag);
+extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2);
 extern LockAcquireResult LockAcquire(const LOCKTAG *locktag,
                        LOCKMODE lockmode,
                        bool sessionLock,
index 61d6aef2ede4b43315d10e423b50e9a394d2f997..ad4d68cd50a0bc8932a97f8ce9cbe464d78e308f 100644 (file)
@@ -1134,6 +1134,9 @@ extern Datum ginarrayconsistent(PG_FUNCTION_ARGS);
 /* access/transam/twophase.c */
 extern Datum pg_prepared_xact(PG_FUNCTION_ARGS);
 
+/* access/transam/multixact.c */
+extern Datum pg_get_multixact_members(PG_FUNCTION_ARGS);
+
 /* catalogs/dependency.c */
 extern Datum pg_describe_object(PG_FUNCTION_ARGS);
 
index bde5f1738e37887218512e8e869ac8875f2c9a98..c342eaa66f2fffc729590dc96dfa63534bcec7a9 100644 (file)
@@ -114,6 +114,7 @@ typedef struct RelationData
        Oid                     rd_id;                  /* relation's object id */
        List       *rd_indexlist;       /* list of OIDs of indexes on relation */
        Bitmapset  *rd_indexattr;       /* identifies columns used in indexes */
+       Bitmapset  *rd_keyattr;         /* cols that can be ref'd by foreign keys */
        Oid                     rd_oidindex;    /* OID of unique index on OID, if any */
        LockInfoData rd_lockInfo;       /* lock mgr's info for locking relation */
        RuleLock   *rd_rules;           /* rewrite rules */
index 444fad34601a550b6c32d9cc3bfae780dff9bd14..1ec2683eacb3cc157774c09b86ac0dde0f8cbdc2 100644 (file)
@@ -41,7 +41,7 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid     RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
-extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs);
 extern void RelationGetExclusionInfo(Relation indexRelation,
                                                 Oid **operators,
                                                 Oid **procs,
@@ -77,7 +77,7 @@ extern Relation RelationBuildLocalRelation(const char *relname,
  * Routine to manage assignment of new relfilenode to a relation
  */
 extern void RelationSetNewRelfilenode(Relation relation,
-                                                 TransactionId freezeXid);
+                                                 TransactionId freezeXid, MultiXactId minmulti);
 
 /*
  * Routines for flushing/rebuilding relcache entries in various scenarios
index 72a8ea42e5f67df3850a6425bba20bb107bc499e..465231c758459f950e491bd1300b8cfc23180a62 100644 (file)
@@ -88,5 +88,6 @@ extern bool HeapTupleIsSurelyDead(HeapTupleHeader tuple,
 
 extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
                                         uint16 infomask, TransactionId xid);
+extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple);
 
 #endif   /* TQUAL_H */
diff --git a/src/test/isolation/expected/aborted-keyrevoke.out b/src/test/isolation/expected/aborted-keyrevoke.out
new file mode 100644 (file)
index 0000000..8850614
--- /dev/null
@@ -0,0 +1,276 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1s s1u s1r s1l s1c s2l s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s1l s2l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s1l s2l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s2c s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s2c s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s2l s2c s1u s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s1s s1u s1r s1l s1c s2c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s1r s1l s2c s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s1r s2c s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s2c s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s1s s2c s1u s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s2c: COMMIT;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s2c s1s s1u s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/aborted-keyrevoke_2.out b/src/test/isolation/expected/aborted-keyrevoke_2.out
new file mode 100644 (file)
index 0000000..85f6ccb
--- /dev/null
@@ -0,0 +1,278 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1s s1u s1r s1l s1c s2l s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s1l s2l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s1l s2l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s1r s2l s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s1r s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s1u s2l s2c s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s2l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s1l s1c s2c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s1l s2c s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s1r s2c s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1s s2l s1u s2c s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1s s2l s2c s1u s1r s1l s1c
+step s1s: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s1s s1u s1r s1l s1c s2c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s1r s1l s2c s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s1r s2c s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1s s1u s2c s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s1s s2c s1u s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1s: SAVEPOINT f;
+step s2c: COMMIT;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s2l s2c s1s s1u s1r s1l s1c
+step s2l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1s: SAVEPOINT f;
+step s1u: UPDATE foo SET key = 2;
+step s1r: ROLLBACK TO f;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/delete-abort-savept-2.out b/src/test/isolation/expected/delete-abort-savept-2.out
new file mode 100644 (file)
index 0000000..f66a90c
--- /dev/null
@@ -0,0 +1,76 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1l s1svp s1d s1r s2l s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: SELECT * FROM foo FOR NO KEY UPDATE;
+key            value          
+
+1              1              
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s2l s1r s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: SELECT * FROM foo FOR NO KEY UPDATE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s1r s2l2 s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: SELECT * FROM foo FOR NO KEY UPDATE;
+key            value          
+
+1              1              
+step s1r: ROLLBACK TO f;
+step s2l2: SELECT * FROM foo FOR NO KEY UPDATE;
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s2l2 s1r s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: SELECT * FROM foo FOR NO KEY UPDATE;
+key            value          
+
+1              1              
+step s2l2: SELECT * FROM foo FOR NO KEY UPDATE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s2l2: <... completed>
+key            value          
+
+1              1              
+step s1c: COMMIT;
+step s2c: COMMIT;
diff --git a/src/test/isolation/expected/delete-abort-savept.out b/src/test/isolation/expected/delete-abort-savept.out
new file mode 100644 (file)
index 0000000..3420cf4
--- /dev/null
@@ -0,0 +1,243 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1l s1svp s1d s1r s1c s2l s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s1r s2l s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s1r s2l s2c s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1l s1svp s1d s2l s1r s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s1d s2l s1r s2c s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1r: ROLLBACK TO f;
+invalid permutation detected
+
+starting permutation: s1l s1svp s1d s2l s2c s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1l s1svp s2l s1d s1r s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s1svp s2l s1d s1r s2c s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+invalid permutation detected
+
+starting permutation: s1l s1svp s2l s1d s2c s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1d: DELETE FROM foo;
+invalid permutation detected
+
+starting permutation: s1l s1svp s2l s2c s1d s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1l s2l s1svp s1d s1r s1c s2c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+step s2l: <... completed>
+key            value          
+
+1              1              
+step s2c: COMMIT;
+
+starting permutation: s1l s2l s1svp s1d s1r s2c s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+invalid permutation detected
+
+starting permutation: s1l s2l s1svp s1d s2c s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+invalid permutation detected
+
+starting permutation: s1l s2l s1svp s2c s1d s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+step s1svp: SAVEPOINT f;
+invalid permutation detected
+
+starting permutation: s1l s2l s2c s1svp s1d s1r s1c
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2l: SELECT * FROM foo FOR UPDATE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1l s1svp s1d s1r s1c s2c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1l s1svp s1d s1r s2c s1c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1l s1svp s1d s2c s1r s1c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1l s1svp s2c s1d s1r s1c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2l s1l s2c s1svp s1d s1r s1c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s2c: COMMIT;
+step s1l: <... completed>
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
+
+starting permutation: s2l s2c s1l s1svp s1d s1r s1c
+step s2l: SELECT * FROM foo FOR UPDATE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s1svp: SAVEPOINT f;
+step s1d: DELETE FROM foo;
+step s1r: ROLLBACK TO f;
+step s1c: COMMIT;
index 24ed72d427cded5c8834c9b14b8af95cd15aee5f..0916f7f3d28b57a442c4909d59f551715ea6ed0d 100644 (file)
@@ -7,9 +7,8 @@ step upd: UPDATE foo SET b = 'Hello World';
 
 starting permutation: ins upd com
 step ins: INSERT INTO bar VALUES (42);
-step upd: UPDATE foo SET b = 'Hello World'; <waiting ...>
+step upd: UPDATE foo SET b = 'Hello World';
 step com: COMMIT;
-step upd: <... completed>
 
 starting permutation: upd ins com
 step upd: UPDATE foo SET b = 'Hello World';
index 36813f11f51753584b5c7603586af884a364a011..69eac88c2b7c05ff502135c89fcf76a44386ebc2 100644 (file)
@@ -11,57 +11,151 @@ step s2c: COMMIT;
 starting permutation: s1i s1u s2i s1c s2u s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
-step s2i: INSERT INTO child VALUES (2, 1); <waiting ...>
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1c: COMMIT;
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
+
+starting permutation: s1i s1u s2i s2u s1c s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+step s2c: COMMIT;
+
+starting permutation: s1i s1u s2i s2u s2c s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1i s2i s1u s1c s2u s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1u: UPDATE parent SET aux = 'bar';
 step s1c: COMMIT;
-step s2i: <... completed>
 step s2u: UPDATE parent SET aux = 'baz';
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s1u s2u s1c s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+step s2c: COMMIT;
+
+starting permutation: s1i s2i s1u s2u s2c s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1i s2i s2u s1u s1c s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
 step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1i s2i s2u s1u s2c s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
 step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
 step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
 step s1c: COMMIT;
+
+starting permutation: s1i s2i s2u s2c s1u s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
 step s2c: COMMIT;
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
 
-starting permutation: s1i s2i s2u s1u s2c s1c
+starting permutation: s2i s1i s1u s1c s2u s2c
+step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
+
+starting permutation: s2i s1i s1u s2u s1c s2c
 step s2i: INSERT INTO child VALUES (2, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
 step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
 step s2c: COMMIT;
-step s1c: COMMIT;
 
-starting permutation: s2i s1i s1u s2u s1c s2c
+starting permutation: s2i s1i s1u s2u s2c s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2i s1i s2u s1u s1c s2c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2u: UPDATE parent SET aux = 'baz';
 step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2i s1i s2u s1u s2c s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
 step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
 step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2i s1i s2u s1u s2c s1c
+starting permutation: s2i s1i s2u s2c s1u s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
 step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s1c: COMMIT;
+
+starting permutation: s2i s2u s1i s1u s1c s2c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2i s2u s1i s1u s2c s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2i s2u s1i s2c s1u s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s2u: UPDATE parent SET aux = 'baz';
-step s1i: INSERT INTO child VALUES (1, 1); <waiting ...>
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2c: COMMIT;
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
+
+starting permutation: s2i s2u s2c s1i s1u s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
 step s2c: COMMIT;
-step s1i: <... completed>
+step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
 step s1c: COMMIT;
index 2d8e5e5b25f268bfd5da68893d07a9058b5e98ab..eda118550c231a90efcb09a24a6c681d5db87bd2 100644 (file)
@@ -17,91 +17,138 @@ step s2u1: <... completed>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
 
+starting permutation: s1u1 s1u2 s2u1 s2u2 s1c s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1u1 s1u2 s2u1 s2u2 s2c s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s1u1 s2u1 s1u2 s1c s2u2 s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
+
 starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
-step s1c: COMMIT;
-step s2c: COMMIT;
+invalid permutation detected
 
 starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s1c: COMMIT;
-step s2c: COMMIT;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
 
 starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s1u2 s1c s2u2 s2c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
+
 starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
-step s1c: COMMIT;
-step s2c: COMMIT;
+invalid permutation detected
 
 starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s1c: COMMIT;
-step s2c: COMMIT;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
 
 starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s1u1 s1u2 s1c s2c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
 step s1c: COMMIT;
 
 starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; <waiting ...>
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2c: COMMIT;
-step s1u1: <... completed>
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1c: COMMIT;
index 30c4c998631afcf42ab122fada730b6d0bc2b988..382734811cbbce0bde1a5e4ef2963e23dc096661 100644 (file)
@@ -19,92 +19,87 @@ step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 ERROR:  current transaction is aborted, commands ignored until end of transaction block
 step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c
+starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
+starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
-starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s1c: COMMIT;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2c: COMMIT;
-
-starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
-step s2c: COMMIT;
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c
+starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
+starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u2: <... completed>
-error in steps s2u2 s1u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
-step s2c: COMMIT;
 
-starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
-step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
-step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s2u2: <... completed>
-error in steps s1u2 s2u2: ERROR:  deadlock detected
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
 step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
 step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
 step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
-step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; <waiting ...>
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
 step s2c: COMMIT;
-step s1u1: <... completed>
 step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
 ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock2_2.out b/src/test/isolation/expected/fk-deadlock2_2.out
new file mode 100644 (file)
index 0000000..b6be4b9
--- /dev/null
@@ -0,0 +1,105 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1u1 s1u2 s1c s2u1 s2u2 s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+
+starting permutation: s1u1 s1u2 s2u1 s1c s2u2 s2c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s1c: COMMIT;
+step s2u1: <... completed>
+error in steps s1c s2u1: ERROR:  could not serialize access due to concurrent update
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  current transaction is aborted, commands ignored until end of transaction block
+step s2c: COMMIT;
+
+starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; <waiting ...>
+step s2c: COMMIT;
+step s1u2: <... completed>
+error in steps s2c s1u2: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s2c: COMMIT;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c
+step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s2c: COMMIT;
+step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1;
+step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2;
+step s1c: COMMIT;
index ca75322cc1229357a29da5a76faa6751d05ba6db..d648e48c480cb3eb63b9c2ecf6d0aa2d4ab689fd 100644 (file)
@@ -11,61 +11,57 @@ step s2c: COMMIT;
 starting permutation: s1i s1u s2i s1c s2u s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s1u: UPDATE parent SET aux = 'bar';
-step s2i: INSERT INTO child VALUES (2, 1); <waiting ...>
+step s2i: INSERT INTO child VALUES (2, 1);
 step s1c: COMMIT;
-step s2i: <... completed>
-error in steps s1c s2i: ERROR:  could not serialize access due to concurrent update
 step s2u: UPDATE parent SET aux = 'baz';
-ERROR:  current transaction is aborted, commands ignored until end of transaction block
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s1u s2u s1c s2c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
 step s2c: COMMIT;
 
 starting permutation: s1i s2i s2u s1u s2c s1c
 step s1i: INSERT INTO child VALUES (1, 1);
 step s2i: INSERT INTO child VALUES (2, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2i s1i s1u s2u s1c s2c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
-step s2u: UPDATE parent SET aux = 'baz';
-step s1u: <... completed>
-error in steps s2u s1u: ERROR:  deadlock detected
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
 step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
 step s2c: COMMIT;
 
 starting permutation: s2i s1i s2u s1u s2c s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s1i: INSERT INTO child VALUES (1, 1);
-step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
-step s1u: UPDATE parent SET aux = 'bar';
-step s2u: <... completed>
-error in steps s1u s2u: ERROR:  deadlock detected
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
 step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
 step s1c: COMMIT;
 
 starting permutation: s2i s2u s1i s2c s1u s1c
 step s2i: INSERT INTO child VALUES (2, 1);
 step s2u: UPDATE parent SET aux = 'baz';
-step s1i: INSERT INTO child VALUES (1, 1); <waiting ...>
+step s1i: INSERT INTO child VALUES (1, 1);
 step s2c: COMMIT;
-step s1i: <... completed>
-error in steps s2c s1i: ERROR:  could not serialize access due to concurrent update
 step s1u: UPDATE parent SET aux = 'bar';
-ERROR:  current transaction is aborted, commands ignored until end of transaction block
+ERROR:  could not serialize access due to read/write dependencies among transactions
 step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-deadlock_2.out b/src/test/isolation/expected/fk-deadlock_2.out
new file mode 100644 (file)
index 0000000..503a7d2
--- /dev/null
@@ -0,0 +1,67 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1i s1u s1c s2i s2u s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s1c: COMMIT;
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s2c: COMMIT;
+
+starting permutation: s1i s1u s2i s1c s2u s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1c: COMMIT;
+step s2u: UPDATE parent SET aux = 'baz';
+ERROR:  could not serialize access due to concurrent update
+step s2c: COMMIT;
+
+starting permutation: s1i s2i s1u s2u s1c s2c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
+step s2c: COMMIT;
+
+starting permutation: s1i s2i s2u s1u s2c s1c
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2i s1i s1u s2u s1c s2c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
+step s1u: UPDATE parent SET aux = 'bar';
+step s2u: UPDATE parent SET aux = 'baz'; <waiting ...>
+step s1c: COMMIT;
+step s2u: <... completed>
+error in steps s1c s2u: ERROR:  could not serialize access due to concurrent update
+step s2c: COMMIT;
+
+starting permutation: s2i s1i s2u s1u s2c s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1u: UPDATE parent SET aux = 'bar'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+error in steps s2c s1u: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s2i s2u s1i s2c s1u s1c
+step s2i: INSERT INTO child VALUES (2, 1);
+step s2u: UPDATE parent SET aux = 'baz';
+step s1i: INSERT INTO child VALUES (1, 1);
+step s2c: COMMIT;
+step s1u: UPDATE parent SET aux = 'bar';
+ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/fk-delete-insert.out b/src/test/isolation/expected/fk-delete-insert.out
new file mode 100644 (file)
index 0000000..1ab15aa
--- /dev/null
@@ -0,0 +1,41 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1d s1c s2i s2c
+step s1d: DELETE FROM A WHERE AID = 1;
+step s1c: COMMIT;
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0);
+ERROR:  insert or update on table "b" violates foreign key constraint "b_aid_fkey"
+step s2c: COMMIT;
+
+starting permutation: s1d s2i s1c s2c
+step s1d: DELETE FROM A WHERE AID = 1;
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); <waiting ...>
+step s1c: COMMIT;
+step s2i: <... completed>
+error in steps s1c s2i: ERROR:  insert or update on table "b" violates foreign key constraint "b_aid_fkey"
+step s2c: COMMIT;
+
+starting permutation: s1d s2i s2c s1c
+step s1d: DELETE FROM A WHERE AID = 1;
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); <waiting ...>
+invalid permutation detected
+
+starting permutation: s2i s1d s1c s2c
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0);
+step s1d: DELETE FROM A WHERE AID = 1; <waiting ...>
+invalid permutation detected
+
+starting permutation: s2i s1d s2c s1c
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0);
+step s1d: DELETE FROM A WHERE AID = 1; <waiting ...>
+step s2c: COMMIT;
+step s1d: <... completed>
+error in steps s2c s1d: ERROR:  update or delete on table "a" violates foreign key constraint "b_aid_fkey" on table "b"
+step s1c: COMMIT;
+
+starting permutation: s2i s2c s1d s1c
+step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0);
+step s2c: COMMIT;
+step s1d: DELETE FROM A WHERE AID = 1;
+ERROR:  update or delete on table "a" violates foreign key constraint "b_aid_fkey" on table "b"
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/lock-update-delete.out b/src/test/isolation/expected/lock-update-delete.out
new file mode 100644 (file)
index 0000000..c424865
--- /dev/null
@@ -0,0 +1,65 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s2b s1s s2u s2d s1l s2c s1c
+step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2b: BEGIN;
+step s1s: SELECT * FROM foo;
+key            value          
+
+1              1              
+step s2u: UPDATE foo SET value = 2 WHERE key = 1;
+step s2d: DELETE FROM foo;
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s2c: COMMIT;
+step s1l: <... completed>
+error in steps s2c s1l: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1b s2b s1s s2u s2d s1l s2r s1c
+step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2b: BEGIN;
+step s1s: SELECT * FROM foo;
+key            value          
+
+1              1              
+step s2u: UPDATE foo SET value = 2 WHERE key = 1;
+step s2d: DELETE FROM foo;
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s2r: ROLLBACK;
+step s1l: <... completed>
+key            value          
+
+1              1              
+step s1c: COMMIT;
+
+starting permutation: s1b s2b s1s s2u s2u2 s1l s2c s1c
+step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2b: BEGIN;
+step s1s: SELECT * FROM foo;
+key            value          
+
+1              1              
+step s2u: UPDATE foo SET value = 2 WHERE key = 1;
+step s2u2: UPDATE foo SET key = 2 WHERE key = 1;
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s2c: COMMIT;
+step s1l: <... completed>
+error in steps s2c s1l: ERROR:  could not serialize access due to concurrent update
+step s1c: COMMIT;
+
+starting permutation: s1b s2b s1s s2u s2u2 s1l s2r s1c
+step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2b: BEGIN;
+step s1s: SELECT * FROM foo;
+key            value          
+
+1              1              
+step s2u: UPDATE foo SET value = 2 WHERE key = 1;
+step s2u2: UPDATE foo SET key = 2 WHERE key = 1;
+step s1l: SELECT * FROM foo FOR KEY SHARE; <waiting ...>
+step s2r: ROLLBACK;
+step s1l: <... completed>
+key            value          
+
+1              1              
+step s1c: COMMIT;
diff --git a/src/test/isolation/expected/lock-update-traversal.out b/src/test/isolation/expected/lock-update-traversal.out
new file mode 100644 (file)
index 0000000..c8e9066
--- /dev/null
@@ -0,0 +1,18 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s2b s1s s2u s1l s2c s2d s1c
+step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2b: BEGIN;
+step s1s: SELECT * FROM foo;
+key            value          
+
+1              1              
+step s2u: UPDATE foo SET value = 2 WHERE key = 1;
+step s1l: SELECT * FROM foo FOR KEY SHARE;
+key            value          
+
+1              1              
+step s2c: COMMIT;
+step s2d: DELETE FROM foo WHERE key = 1; <waiting ...>
+step s1c: COMMIT;
+step s2d: <... completed>
diff --git a/src/test/isolation/expected/multixact-no-deadlock.out b/src/test/isolation/expected/multixact-no-deadlock.out
new file mode 100644 (file)
index 0000000..5ba2e78
--- /dev/null
@@ -0,0 +1,24 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1lock s2lock s1svpt s3lock s1lock2 s2c s1c s3c
+step s1lock: SELECT * FROM justthis FOR SHARE;
+value          
+
+1              
+step s2lock: SELECT * FROM justthis FOR SHARE;
+value          
+
+1              
+step s1svpt: SAVEPOINT foo;
+step s3lock: SELECT * FROM justthis FOR UPDATE; <waiting ...>
+step s1lock2: SELECT * FROM justthis FOR SHARE;
+value          
+
+1              
+step s2c: COMMIT;
+step s1c: COMMIT;
+step s3lock: <... completed>
+value          
+
+1              
+step s3c: COMMIT;
index 1d0770cd37e27fae2d050cc80323751629bd9123..c4d6719de6da19d8d525382389f5127c62323b12 100644 (file)
@@ -14,4 +14,9 @@ test: fk-contention
 test: fk-deadlock
 test: fk-deadlock2
 test: eval-plan-qual
+test: lock-update-delete
+test: lock-update-traversal
+test: delete-abort-savept
+test: delete-abort-savept-2
+test: aborted-keyrevoke
 test: drop-index-concurrently-1
index 4c4556654b390b8520289a942f8454f867bbbe51..f1bb87d2f130bbb39476cd74720092d05ee63a2e 100644 (file)
@@ -564,6 +564,7 @@ run_permutation(TestSpec * testspec, int nsteps, Step ** steps)
                         * but it can only be unblocked by running steps from other
                         * sessions.
                         */
+                       fflush(stdout);
                        fprintf(stderr, "invalid permutation detected\n");
 
                        /* Cancel the waiting statement from this session. */
diff --git a/src/test/isolation/specs/aborted-keyrevoke.spec b/src/test/isolation/specs/aborted-keyrevoke.spec
new file mode 100644 (file)
index 0000000..c60aa0c
--- /dev/null
@@ -0,0 +1,31 @@
+# When a subtransaction that updated a tuple's key is rolled back, the
+# KEY REVOKE lock it obtained is lost; thus, other lockers using FOR KEY
+# SHARE should not remain blocked by the aborted key update.
+
+setup
+{
+  CREATE TABLE foo (
+       key             int PRIMARY KEY,
+       value   int
+  );
+
+  INSERT INTO foo VALUES (1, 1);
+}
+
+teardown
+{
+  DROP TABLE foo;
+}
+
+session "s1"
+setup          { BEGIN; }
+step "s1s"     { SAVEPOINT f; }
+step "s1u"     { UPDATE foo SET key = 2; }     # obtain KEY REVOKE
+step "s1r"     { ROLLBACK TO f; } # lose KEY REVOKE
+step "s1l"     { SELECT * FROM foo FOR KEY SHARE; }
+step "s1c"     { COMMIT; }
+
+session "s2"
+setup          { BEGIN; }
+step "s2l"     { SELECT * FROM foo FOR KEY SHARE; }
+step "s2c"     { COMMIT; }
diff --git a/src/test/isolation/specs/delete-abort-savept-2.spec b/src/test/isolation/specs/delete-abort-savept-2.spec
new file mode 100644 (file)
index 0000000..d35c67f
--- /dev/null
@@ -0,0 +1,34 @@
+# Like delete-abort-savept, but the subtransaction takes FOR NO KEY UPDATE instead of deleting
+setup
+{
+  CREATE TABLE foo (
+     key INT PRIMARY KEY,
+     value INT
+  );
+
+  INSERT INTO foo VALUES (1, 1);
+}
+
+teardown
+{
+  DROP TABLE foo;
+}
+
+session "s1"
+setup                  { BEGIN; }
+step "s1l"             { SELECT * FROM foo FOR KEY SHARE; }
+step "s1svp"   { SAVEPOINT f; }
+step "s1d"             { SELECT * FROM foo FOR NO KEY UPDATE; }
+step "s1r"             { ROLLBACK TO f; }
+step "s1c"             { COMMIT; }
+
+session "s2"
+setup                  { BEGIN; }
+step "s2l"             { SELECT * FROM foo FOR UPDATE; }
+step "s2l2"            { SELECT * FROM foo FOR NO KEY UPDATE; }
+step "s2c"             { COMMIT; }
+
+permutation "s1l" "s1svp" "s1d" "s1r" "s2l" "s1c" "s2c"
+permutation "s1l" "s1svp" "s1d" "s2l" "s1r" "s1c" "s2c"
+permutation "s1l" "s1svp" "s1d" "s1r" "s2l2" "s1c" "s2c"
+permutation "s1l" "s1svp" "s1d" "s2l2" "s1r" "s1c" "s2c"
diff --git a/src/test/isolation/specs/delete-abort-savept.spec b/src/test/isolation/specs/delete-abort-savept.spec
new file mode 100644 (file)
index 0000000..e41df20
--- /dev/null
@@ -0,0 +1,29 @@
+# After rolling back a subtransaction that upgraded a lock, the previously
+# held lock should still be held.
+setup
+{
+  CREATE TABLE foo (
+     key INT PRIMARY KEY,
+     value INT
+  );
+
+  INSERT INTO foo VALUES (1, 1);
+}
+
+teardown
+{
+  DROP TABLE foo;
+}
+
+session "s1"
+setup                  { BEGIN; }
+step "s1l"             { SELECT * FROM foo FOR KEY SHARE; }
+step "s1svp"   { SAVEPOINT f; }
+step "s1d"             { DELETE FROM foo; }
+step "s1r"             { ROLLBACK TO f; }
+step "s1c"             { COMMIT; }
+
+session "s2"
+setup                  { BEGIN; }
+step "s2l"             { SELECT * FROM foo FOR UPDATE; }
+step "s2c"             { COMMIT; }
index 9f46c6b665c4adcb1c38f5124722668a0c527cfa..44500d5b9bb50385c18398bec23445c6e4ed056b 100644 (file)
@@ -29,26 +29,3 @@ setup                { BEGIN; SET deadlock_timeout = '10s'; }
 step "s2i"     { INSERT INTO child VALUES (2, 1); }
 step "s2u"     { UPDATE parent SET aux = 'baz'; }
 step "s2c"     { COMMIT; }
-
-## Most theoretical permutations require that a blocked session execute a
-## command, making them impossible in practice.
-permutation "s1i" "s1u" "s1c" "s2i" "s2u" "s2c"
-permutation "s1i" "s1u" "s2i" "s1c" "s2u" "s2c"
-#permutation "s1i" "s1u" "s2i" "s2u" "s1c" "s2c"
-#permutation "s1i" "s1u" "s2i" "s2u" "s2c" "s1c"
-#permutation "s1i" "s2i" "s1u" "s1c" "s2u" "s2c"
-permutation "s1i" "s2i" "s1u" "s2u" "s1c" "s2c"
-#permutation "s1i" "s2i" "s1u" "s2u" "s2c" "s1c"
-#permutation "s1i" "s2i" "s2u" "s1u" "s1c" "s2c"
-permutation "s1i" "s2i" "s2u" "s1u" "s2c" "s1c"
-#permutation "s1i" "s2i" "s2u" "s2c" "s1u" "s1c"
-#permutation "s2i" "s1i" "s1u" "s1c" "s2u" "s2c"
-permutation "s2i" "s1i" "s1u" "s2u" "s1c" "s2c"
-#permutation "s2i" "s1i" "s1u" "s2u" "s2c" "s1c"
-#permutation "s2i" "s1i" "s2u" "s1u" "s1c" "s2c"
-permutation "s2i" "s1i" "s2u" "s1u" "s2c" "s1c"
-#permutation "s2i" "s1i" "s2u" "s2c" "s1u" "s1c"
-#permutation "s2i" "s2u" "s1i" "s1u" "s1c" "s2c"
-#permutation "s2i" "s2u" "s1i" "s1u" "s2c" "s1c"
-permutation "s2i" "s2u" "s1i" "s2c" "s1u" "s1c"
-#permutation "s2i" "s2u" "s2c" "s1i" "s1u" "s1c"
index a8f1516c4ec10cbcff052a61fc9527f4e0040f6c..f500b26585c4d5115f15233030892f66ce68480b 100644 (file)
@@ -34,26 +34,3 @@ setup                { BEGIN; SET deadlock_timeout = '10s'; }
 step "s2u1"    { UPDATE B SET Col2 = 1 WHERE BID = 2; }
 step "s2u2"    { UPDATE B SET Col2 = 1 WHERE BID = 2; }
 step "s2c"     { COMMIT; }
-
-## Many theoretical permutations require that a blocked session execute a
-## command, making them impossible in practice.
-permutation "s1u1" "s1u2" "s1c" "s2u1" "s2u2" "s2c"
-permutation "s1u1" "s1u2" "s2u1" "s1c" "s2u2" "s2c"
-#permutation "s1u1" "s1u2" "s2u1" "s2u2" "s1c" "s2c"
-#permutation "s1u1" "s1u2" "s2u1" "s2u2" "s2c" "s1c"
-#permutation "s1u1" "s2u1" "s1u2" "s1c" "s2u2" "s2c"
-permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c"
-permutation "s1u1" "s2u1" "s1u2" "s2u2" "s2c" "s1c"
-permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c"
-permutation "s1u1" "s2u1" "s2u2" "s1u2" "s2c" "s1c"
-#permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c"
-#permutation "s2u1" "s1u1" "s1u2" "s1c" "s2u2" "s2c"
-permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c"
-permutation "s2u1" "s1u1" "s1u2" "s2u2" "s2c" "s1c"
-permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c"
-permutation "s2u1" "s1u1" "s2u2" "s1u2" "s2c" "s1c"
-#permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c"
-#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s1c" "s2c"
-#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c"
-permutation "s2u1" "s2u2" "s1u1" "s2c" "s1u2" "s1c"
-#permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" "s1c"
diff --git a/src/test/isolation/specs/lock-update-delete.spec b/src/test/isolation/specs/lock-update-delete.spec
new file mode 100644 (file)
index 0000000..4b9a5a6
--- /dev/null
@@ -0,0 +1,38 @@
+# If we update a tuple, and then delete it (or update it again in a way that
+# touches the key), and later somebody comes along and tries to traverse that
+# update chain, they should get an error when locking the latest version if
+# the deleting transaction committed, or succeed if it rolled back.
+
+setup
+{
+  CREATE TABLE foo (
+       key             int PRIMARY KEY,
+       value   int
+  );
+
+  INSERT INTO foo VALUES (1, 1);
+}
+
+teardown
+{
+  DROP TABLE foo;
+}
+
+session "s1"
+step "s1b"     { BEGIN ISOLATION LEVEL REPEATABLE READ; }
+step "s1s"     { SELECT * FROM foo; }  # obtain snapshot
+step "s1l"     { SELECT * FROM foo FOR KEY SHARE; } # obtain lock
+step "s1c"     { COMMIT; }
+
+session "s2"
+step "s2b"     { BEGIN; }
+step "s2u"     { UPDATE foo SET value = 2 WHERE key = 1; }
+step "s2d"     { DELETE FROM foo; }
+step "s2u2"    { UPDATE foo SET key = 2 WHERE key = 1; }
+step "s2c"     { COMMIT; }
+step "s2r"     { ROLLBACK; }
+
+permutation "s1b" "s2b" "s1s" "s2u" "s2d" "s1l" "s2c" "s1c"
+permutation "s1b" "s2b" "s1s" "s2u" "s2d" "s1l" "s2r" "s1c"
+permutation "s1b" "s2b" "s1s" "s2u" "s2u2" "s1l" "s2c" "s1c"
+permutation "s1b" "s2b" "s1s" "s2u" "s2u2" "s1l" "s2r" "s1c"
diff --git a/src/test/isolation/specs/lock-update-traversal.spec b/src/test/isolation/specs/lock-update-traversal.spec
new file mode 100644 (file)
index 0000000..6c6c805
--- /dev/null
@@ -0,0 +1,32 @@
+# When a tuple that has been updated is locked, the locking command
+# should traverse the update chain; thus, a DELETE should not be able
+# to proceed until the lock has been released.
+
+setup
+{
+  CREATE TABLE foo (
+       key             int PRIMARY KEY,
+       value   int
+  );
+
+  INSERT INTO foo VALUES (1, 1);
+}
+
+teardown
+{
+  DROP TABLE foo;
+}
+
+session "s1"
+step "s1b"     { BEGIN ISOLATION LEVEL REPEATABLE READ; }
+step "s1s"     { SELECT * FROM foo; }  # obtain snapshot
+step "s1l"     { SELECT * FROM foo FOR KEY SHARE; } # obtain lock
+step "s1c"     { COMMIT; }
+
+session "s2"
+step "s2b"     { BEGIN; }
+step "s2u"     { UPDATE foo SET value = 2 WHERE key = 1; }
+step "s2c"     { COMMIT; }
+step "s2d"     { DELETE FROM foo WHERE key = 1; }
+
+permutation "s1b" "s2b" "s1s" "s2u" "s1l" "s2c" "s2d" "s1c"
diff --git a/src/test/isolation/specs/multixact-no-deadlock.spec b/src/test/isolation/specs/multixact-no-deadlock.spec
new file mode 100644 (file)
index 0000000..205658b
--- /dev/null
@@ -0,0 +1,35 @@
+# If we already hold a lock of a given strength, do not deadlock when
+# some other transaction is waiting for a conflicting lock and we try
+# to acquire the same lock we already held.
+setup
+{
+  CREATE TABLE justthis (
+       value   int
+  );
+
+  INSERT INTO justthis VALUES (1);
+}
+
+teardown
+{
+  DROP TABLE justthis;
+}
+
+session "s1"
+setup                  { BEGIN; }
+step "s1lock"  { SELECT * FROM justthis FOR SHARE; }
+step "s1svpt"  { SAVEPOINT foo; }
+step "s1lock2" { SELECT * FROM justthis FOR SHARE; }
+step "s1c"             { COMMIT; }
+
+session "s2"
+setup                  { BEGIN; }
+step "s2lock"  { SELECT * FROM justthis FOR SHARE; }   # ensure it's a multi
+step "s2c"             { COMMIT; }
+
+session "s3"
+setup                  { BEGIN; }
+step "s3lock"  { SELECT * FROM justthis FOR UPDATE; }
+step "s3c"             { COMMIT; }
+
+permutation "s1lock" "s2lock" "s1svpt" "s3lock" "s1lock2" "s2c" "s1c" "s3c"