From 0ac5ad5134f2769ccbaefec73844f8504c4d6182 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera
Date: Wed, 23 Jan 2013 12:04:59 -0300
Subject: [PATCH] Improve concurrency of foreign key locking

This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE".  These don't block each
other, in contrast with the already existing "SELECT FOR SHARE" and
"SELECT FOR UPDATE".  UPDATE commands that do not modify the values
stored in the columns that are part of the key of the tuple now grab a
SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed
concurrently with tuple locks of the FOR KEY SHARE variety.

Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.

The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid.  Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates.  This means we need more
careful tracking of the lifetime of pg_multixact SLRU files; since they
now persist longer, we require more infrastructure to figure out when
they can be removed.  pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.

Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas previously they were assured to use only information readily
available from the tuple header.  This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.

Another important change is that locking tuples that have previously
been updated causes the future versions to be marked as locked, too;
this is essential for correctness of foreign key checks.  This also
causes additional WAL-logging (there was previously a single WAL record
for a locked tuple; now there is one for each updated copy of the
tuple).

With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.

As a bonus, the old behavior in which a subtransaction that grabbed a
stronger tuple lock than its parent (sub)transaction held on a given
tuple, and later aborted, caused the weaker lock to be lost has been
fixed.

Many new spec files were added for the isolation tester framework, to
ensure overall behavior is sane.  There's probably room for several more
tests.

There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time on it.  Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
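For illustration (this example is not part of the patch; the table and column names are assumptions), the headline behavior change looks like this at the SQL level: a row lock taken by a foreign key check no longer waits behind an UPDATE that leaves the key columns alone, because the update now acquires FOR NO KEY UPDATE and the RI trigger acquires FOR KEY SHARE.

    -- hypothetical schema
    CREATE TABLE parent (id int PRIMARY KEY, info text);
    CREATE TABLE child  (id int REFERENCES parent (id), note text);
    INSERT INTO parent VALUES (1, 'original');

    -- session 1: update a non-key column; takes FOR NO KEY UPDATE on the row
    BEGIN;
    UPDATE parent SET info = 'changed' WHERE id = 1;

    -- session 2: the RI trigger locks parent row 1 with FOR KEY SHARE,
    -- which does not conflict, so the insert completes without waiting
    INSERT INTO child VALUES (1, 'new row');

Before this patch the RI trigger used FOR SHARE, which conflicts with the exclusive lock taken by any UPDATE, so session 2 would have blocked until session 1 committed or rolled back.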
This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov --- contrib/file_fdw/output/file_fdw.source | 2 +- contrib/pageinspect/heapfuncs.c | 2 +- contrib/pg_upgrade/controldata.c | 46 + contrib/pg_upgrade/pg_upgrade.c | 46 + contrib/pg_upgrade/pg_upgrade.h | 7 + contrib/pgrowlocks/Makefile | 2 +- contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql | 17 + ...growlocks--1.0.sql => pgrowlocks--1.1.sql} | 4 +- contrib/pgrowlocks/pgrowlocks.c | 178 +- contrib/pgrowlocks/pgrowlocks.control | 2 +- doc/src/sgml/pgrowlocks.sgml | 15 +- doc/src/sgml/ref/select.sgml | 146 +- src/backend/access/common/heaptuple.c | 2 +- src/backend/access/heap/README.tuplock | 139 + src/backend/access/heap/heapam.c | 2245 +++++++++++++---- src/backend/access/heap/pruneheap.c | 10 +- src/backend/access/heap/rewriteheap.c | 17 +- src/backend/access/rmgrdesc/heapdesc.c | 56 +- src/backend/access/rmgrdesc/mxactdesc.c | 37 +- src/backend/access/rmgrdesc/xlogdesc.c | 5 +- src/backend/access/transam/README | 6 +- src/backend/access/transam/multixact.c | 1191 ++++++--- src/backend/access/transam/varsup.c | 2 + src/backend/access/transam/xlog.c | 14 +- src/backend/catalog/heap.c | 14 +- src/backend/catalog/index.c | 9 +- src/backend/commands/analyze.c | 9 +- src/backend/commands/cluster.c | 37 +- src/backend/commands/dbcommands.c | 15 +- src/backend/commands/sequence.c | 10 +- src/backend/commands/tablecmds.c | 12 +- src/backend/commands/trigger.c | 32 +- src/backend/commands/vacuum.c | 96 +- src/backend/commands/vacuumlazy.c | 24 +- src/backend/executor/execMain.c | 21 +- src/backend/executor/nodeLockRows.c | 25 +- src/backend/executor/nodeModifyTable.c | 6 +- src/backend/nodes/copyfuncs.c | 4 +- src/backend/nodes/equalfuncs.c | 4 +- src/backend/nodes/outfuncs.c | 4 +- src/backend/nodes/readfuncs.c | 2 +- src/backend/optimizer/plan/initsplan.c | 6 +- src/backend/optimizer/plan/planner.c | 37 +- src/backend/parser/analyze.c | 58 +- src/backend/parser/gram.y | 29 +- src/backend/postmaster/autovacuum.c | 45 +- src/backend/rewrite/rewriteHandler.c | 32 +- src/backend/storage/lmgr/lock.c | 15 +- src/backend/storage/lmgr/predicate.c | 4 +- src/backend/tcop/utility.c | 50 +- src/backend/utils/adt/ri_triggers.c | 23 +- src/backend/utils/adt/ruleutils.c | 28 +- src/backend/utils/cache/relcache.c | 31 +- src/backend/utils/time/combocid.c | 5 +- src/backend/utils/time/tqual.c | 466 +++- src/bin/pg_controldata/pg_controldata.c | 4 + src/bin/pg_resetxlog/pg_resetxlog.c | 37 +- src/include/access/heapam.h | 23 +- src/include/access/heapam_xlog.h | 33 +- src/include/access/htup.h | 6 +- src/include/access/htup_details.h | 62 +- src/include/access/multixact.h | 66 +- src/include/access/rewriteheap.h | 2 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_class.h | 24 +- src/include/catalog/pg_control.h | 4 +- src/include/catalog/pg_database.h | 10 +- src/include/catalog/pg_proc.h | 2 + src/include/commands/cluster.h | 3 +- src/include/commands/vacuum.h | 6 +- src/include/executor/executor.h | 2 +- src/include/nodes/execnodes.h | 8 +- src/include/nodes/parsenodes.h | 36 +- src/include/nodes/plannodes.h | 12 +- src/include/parser/analyze.h | 2 +- src/include/postgres.h | 7 + src/include/storage/lock.h | 
1 + src/include/utils/builtins.h | 3 + src/include/utils/rel.h | 1 + src/include/utils/relcache.h | 4 +- src/include/utils/tqual.h | 1 + .../isolation/expected/aborted-keyrevoke.out | 276 ++ .../expected/aborted-keyrevoke_2.out | 278 ++ .../expected/delete-abort-savept-2.out | 76 + .../expected/delete-abort-savept.out | 243 ++ src/test/isolation/expected/fk-contention.out | 3 +- src/test/isolation/expected/fk-deadlock.out | 126 +- src/test/isolation/expected/fk-deadlock2.out | 113 +- .../isolation/expected/fk-deadlock2_1.out | 75 +- .../isolation/expected/fk-deadlock2_2.out | 105 + src/test/isolation/expected/fk-deadlock_1.out | 44 +- src/test/isolation/expected/fk-deadlock_2.out | 67 + .../isolation/expected/fk-delete-insert.out | 41 + .../isolation/expected/lock-update-delete.out | 65 + .../expected/lock-update-traversal.out | 18 + .../expected/multixact-no-deadlock.out | 24 + src/test/isolation/isolation_schedule | 5 + src/test/isolation/isolationtester.c | 1 + .../isolation/specs/aborted-keyrevoke.spec | 31 + .../specs/delete-abort-savept-2.spec | 34 + .../isolation/specs/delete-abort-savept.spec | 29 + src/test/isolation/specs/fk-deadlock.spec | 23 - src/test/isolation/specs/fk-deadlock2.spec | 23 - .../isolation/specs/lock-update-delete.spec | 38 + .../specs/lock-update-traversal.spec | 32 + .../specs/multixact-no-deadlock.spec | 35 + 106 files changed, 6023 insertions(+), 1487 deletions(-) create mode 100644 contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql rename contrib/pgrowlocks/{pgrowlocks--1.0.sql => pgrowlocks--1.1.sql} (83%) create mode 100644 src/backend/access/heap/README.tuplock create mode 100644 src/test/isolation/expected/aborted-keyrevoke.out create mode 100644 src/test/isolation/expected/aborted-keyrevoke_2.out create mode 100644 src/test/isolation/expected/delete-abort-savept-2.out create mode 100644 src/test/isolation/expected/delete-abort-savept.out create mode 100644 src/test/isolation/expected/fk-deadlock2_2.out create mode 100644 src/test/isolation/expected/fk-deadlock_2.out create mode 100644 src/test/isolation/expected/fk-delete-insert.out create mode 100644 src/test/isolation/expected/lock-update-delete.out create mode 100644 src/test/isolation/expected/lock-update-traversal.out create mode 100644 src/test/isolation/expected/multixact-no-deadlock.out create mode 100644 src/test/isolation/specs/aborted-keyrevoke.spec create mode 100644 src/test/isolation/specs/delete-abort-savept-2.spec create mode 100644 src/test/isolation/specs/delete-abort-savept.spec create mode 100644 src/test/isolation/specs/lock-update-delete.spec create mode 100644 src/test/isolation/specs/lock-update-traversal.spec create mode 100644 src/test/isolation/specs/multixact-no-deadlock.spec diff --git a/contrib/file_fdw/output/file_fdw.source b/contrib/file_fdw/output/file_fdw.source index 6f906e1fc8..c01f8d804b 100644 --- a/contrib/file_fdw/output/file_fdw.source +++ b/contrib/file_fdw/output/file_fdw.source @@ -191,7 +191,7 @@ ERROR: cannot change foreign table "agg_csv" DELETE FROM agg_csv WHERE a = 100; ERROR: cannot change foreign table "agg_csv" SELECT * FROM agg_csv FOR UPDATE OF agg_csv; -ERROR: SELECT FOR UPDATE/SHARE cannot be used with foreign table "agg_csv" +ERROR: SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be used with foreign table "agg_csv" LINE 1: SELECT * FROM agg_csv FOR UPDATE OF agg_csv; ^ -- but this should be ignored diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index bbf796ff43..6d8f6f1c74 100644 --- 
a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -163,7 +163,7 @@ heap_page_items(PG_FUNCTION_ARGS) tuphdr = (HeapTupleHeader) PageGetItem(page, id); values[4] = UInt32GetDatum(HeapTupleHeaderGetXmin(tuphdr)); - values[5] = UInt32GetDatum(HeapTupleHeaderGetXmax(tuphdr)); + values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr)); values[6] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); /* shared with xvac */ values[7] = PointerGetDatum(&tuphdr->t_ctid); values[8] = UInt32GetDatum(tuphdr->t_infomask2); diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c index 9218f65abc..7c80c87315 100644 --- a/contrib/pg_upgrade/controldata.c +++ b/contrib/pg_upgrade/controldata.c @@ -40,6 +40,9 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_xid = false; bool got_oid = false; bool got_nextxlogfile = false; + bool got_multi = false; + bool got_mxoff = false; + bool got_oldestmulti = false; bool got_log_id = false; bool got_log_seg = false; bool got_tli = false; @@ -246,6 +249,39 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.chkpnt_nxtoid = str2uint(p); got_oid = true; } + else if ((p = strstr(bufin, "Latest checkpoint's NextMultiXactId:")) != NULL) + { + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__); + + p++; /* removing ':' char */ + cluster->controldata.chkpnt_nxtmulti = str2uint(p); + got_multi = true; + } + else if ((p = strstr(bufin, "Latest checkpoint's oldestMultiXid:")) != NULL) + { + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__); + + p++; /* removing ':' char */ + cluster->controldata.chkpnt_oldstMulti = str2uint(p); + got_oldestmulti = true; + } + else if ((p = strstr(bufin, "Latest checkpoint's NextMultiOffset:")) != NULL) + { + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__); + + p++; /* removing ':' char */ + cluster->controldata.chkpnt_nxtmxoff = str2uint(p); + got_mxoff = true; + } else if ((p = strstr(bufin, "Maximum data alignment:")) != NULL) { p = strchr(p, ':'); @@ -433,6 +469,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) /* verify that we got all the mandatory pg_control data */ if (!got_xid || !got_oid || + !got_multi || !got_mxoff || !got_oldestmulti || (!live_check && !got_nextxlogfile) || !got_tli || !got_align || !got_blocksz || !got_largesz || !got_walsz || @@ -448,6 +485,15 @@ get_control_data(ClusterInfo *cluster, bool live_check) if (!got_oid) pg_log(PG_REPORT, " latest checkpoint next OID\n"); + if (!got_multi) + pg_log(PG_REPORT, " latest checkpoint next MultiXactId\n"); + + if (!got_mxoff) + pg_log(PG_REPORT, " latest checkpoint next MultiXactOffset\n"); + + if (!got_oldestmulti) + pg_log(PG_REPORT, " latest checkpoint oldest MultiXactId\n"); + if (!live_check && !got_nextxlogfile) pg_log(PG_REPORT, " first WAL segment after reset\n"); diff --git a/contrib/pg_upgrade/pg_upgrade.c b/contrib/pg_upgrade/pg_upgrade.c index 88494b8d6d..a752fe8eda 100644 --- a/contrib/pg_upgrade/pg_upgrade.c +++ b/contrib/pg_upgrade/pg_upgrade.c @@ -382,6 +382,52 @@ copy_clog_xlog_xid(void) new_cluster.pgdata); check_ok(); + /* + * If both new and old are after the pg_multixact change commit, copy those + * files too. 
If the old server is before that change and the new server + * is after, then we don't copy anything but we need to reset pg_control so + * that the new server doesn't attempt to read multis older than the cutoff + * value. + */ + if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + { + copy_subdir_files("pg_multixact/offsets"); + copy_subdir_files("pg_multixact/members"); + prep_status("Setting next multixact ID and offset for new cluster"); + /* + * we preserve all files and contents, so we must preserve both "next" + * counters here and the oldest multi present on system. + */ + exec_prog(UTILITY_LOG_FILE, NULL, true, + "\"%s/pg_resetxlog\" -O %u -m %u,%u \"%s\"", + new_cluster.bindir, + old_cluster.controldata.chkpnt_nxtmxoff, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_oldstMulti, + new_cluster.pgdata); + check_ok(); + } + else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + { + prep_status("Setting oldest multixact ID on new cluster"); + /* + * We don't preserve files in this case, but it's important that the + * oldest multi is set to the latest value used by the old system, so + * that multixact.c returns the empty set for multis that might be + * present on disk. We set next multi to the value following that; it + * might end up wrapped around (i.e. 0) if the old cluster had + * next=MaxMultiXactId, but multixact.c can cope with that just fine. + */ + exec_prog(UTILITY_LOG_FILE, NULL, true, + "\"%s/pg_resetxlog\" -m %u,%u \"%s\"", + new_cluster.bindir, + old_cluster.controldata.chkpnt_nxtmulti + 1, + old_cluster.controldata.chkpnt_nxtmulti, + new_cluster.pgdata); + check_ok(); + } + /* now reset the wal archives in the new cluster */ prep_status("Resetting WAL archives"); exec_prog(UTILITY_LOG_FILE, NULL, true, diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h index d5c3fa9e83..70b9381667 100644 --- a/contrib/pg_upgrade/pg_upgrade.h +++ b/contrib/pg_upgrade/pg_upgrade.h @@ -108,6 +108,10 @@ extern char *output_files[]; */ #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 +/* + * pg_multixact format changed in this catversion: + */ +#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 /* * Each relation is represented by a relinfo structure. @@ -182,6 +186,9 @@ typedef struct uint32 chkpnt_tli; uint32 chkpnt_nxtxid; uint32 chkpnt_nxtoid; + uint32 chkpnt_nxtmulti; + uint32 chkpnt_nxtmxoff; + uint32 chkpnt_oldstMulti; uint32 align; uint32 blocksz; uint32 largesz; diff --git a/contrib/pgrowlocks/Makefile b/contrib/pgrowlocks/Makefile index f56389b0e2..fe8042344f 100644 --- a/contrib/pgrowlocks/Makefile +++ b/contrib/pgrowlocks/Makefile @@ -4,7 +4,7 @@ MODULE_big = pgrowlocks OBJS = pgrowlocks.o EXTENSION = pgrowlocks -DATA = pgrowlocks--1.0.sql pgrowlocks--unpackaged--1.0.sql +DATA = pgrowlocks--1.1.sql pgrowlocks--1.0--1.1.sql pgrowlocks--unpackaged--1.0.sql ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql new file mode 100644 index 0000000000..d98cd807ca --- /dev/null +++ b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql @@ -0,0 +1,17 @@ +/* contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pgrowlocks" to load this file. 
\quit + +ALTER EXTENSION pgrowlocks DROP FUNCTION pgrowlocks(text); +DROP FUNCTION pgrowlocks(text); +CREATE FUNCTION pgrowlocks(IN relname text, + OUT locked_row TID, -- row TID + OUT locker XID, -- locking XID + OUT multi bool, -- multi XID? + OUT xids xid[], -- multi XIDs + OUT modes text[], -- multi XID statuses + OUT pids INTEGER[]) -- locker's process id +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pgrowlocks' +LANGUAGE C STRICT; diff --git a/contrib/pgrowlocks/pgrowlocks--1.0.sql b/contrib/pgrowlocks/pgrowlocks--1.1.sql similarity index 83% rename from contrib/pgrowlocks/pgrowlocks--1.0.sql rename to contrib/pgrowlocks/pgrowlocks--1.1.sql index a909b7430d..29079f4923 100644 --- a/contrib/pgrowlocks/pgrowlocks--1.0.sql +++ b/contrib/pgrowlocks/pgrowlocks--1.1.sql @@ -1,14 +1,14 @@ -/* contrib/pgrowlocks/pgrowlocks--1.0.sql */ +/* contrib/pgrowlocks/pgrowlocks--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit CREATE FUNCTION pgrowlocks(IN relname text, OUT locked_row TID, -- row TID - OUT lock_type TEXT, -- lock type OUT locker XID, -- locking XID OUT multi bool, -- multi XID? OUT xids xid[], -- multi XIDs + OUT modes text[], -- multi XID statuses OUT pids INTEGER[]) -- locker's process id RETURNS SETOF record AS 'MODULE_PATHNAME', 'pgrowlocks' diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index 20beed2a30..43ada57352 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -59,6 +59,13 @@ typedef struct int ncolumns; } MyData; +#define Atnum_tid 0 +#define Atnum_xmax 1 +#define Atnum_ismulti 2 +#define Atnum_xids 3 +#define Atnum_modes 4 +#define Atnum_pids 5 + Datum pgrowlocks(PG_FUNCTION_ARGS) { @@ -117,79 +124,146 @@ pgrowlocks(PG_FUNCTION_ARGS) /* scan the relation */ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { + HTSU_Result htsu; + TransactionId xmax; + uint16 infomask; + /* must hold a buffer lock to call HeapTupleSatisfiesUpdate */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - if (HeapTupleSatisfiesUpdate(tuple->t_data, - GetCurrentCommandId(false), - scan->rs_cbuf) == HeapTupleBeingUpdated) + htsu = HeapTupleSatisfiesUpdate(tuple->t_data, + GetCurrentCommandId(false), + scan->rs_cbuf); + xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + infomask = tuple->t_data->t_infomask; + + /* + * a tuple is locked if HTSU returns BeingUpdated, and if it returns + * MayBeUpdated but the Xmax is valid and pointing at us. 
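As a usage note (not part of the patch itself), an existing pgrowlocks 1.0 installation can be moved to the new signature with the standard extension upgrade path, while a fresh install picks up 1.1 once default_version is bumped in the control file (see the pgrowlocks.control hunk further down):

    -- upgrade an existing installation to the 1.1 signature
    ALTER EXTENSION pgrowlocks UPDATE TO '1.1';

    -- or install fresh; default_version is now 1.1
    CREATE EXTENSION pgrowlocks;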
+ */ + if (htsu == HeapTupleBeingUpdated || + (htsu == HeapTupleMayBeUpdated && + !(infomask & HEAP_XMAX_INVALID) && + !(infomask & HEAP_XMAX_IS_MULTI) && + (xmax == GetCurrentTransactionIdIfAny()))) { - char **values; - int i; values = (char **) palloc(mydata->ncolumns * sizeof(char *)); - i = 0; - values[i++] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self)); + values[Atnum_tid] = (char *) DirectFunctionCall1(tidout, + PointerGetDatum(&tuple->t_self)); - if (tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK) - values[i++] = pstrdup("Shared"); - else - values[i++] = pstrdup("Exclusive"); - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "%d", HeapTupleHeaderGetXmax(tuple->t_data)); - if (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) + values[Atnum_xmax] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_xmax], NCHARS, "%d", xmax); + if (infomask & HEAP_XMAX_IS_MULTI) { - TransactionId *xids; - int nxids; - int j; - int isValidXid = 0; /* any valid xid ever exists? */ - - values[i++] = pstrdup("true"); - nxids = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &xids); - if (nxids == -1) + MultiXactMember *members; + int nmembers; + bool first = true; + bool allow_old; + + values[Atnum_ismulti] = pstrdup("true"); + + allow_old = !(infomask & HEAP_LOCK_MASK) && + (infomask & HEAP_XMAX_LOCK_ONLY); + nmembers = GetMultiXactIdMembers(xmax, &members, allow_old); + if (nmembers == -1) { - elog(ERROR, "GetMultiXactIdMembers returns error"); + values[Atnum_xids] = "{0}"; + values[Atnum_modes] = "{transient upgrade status}"; + values[Atnum_pids] = "{0}"; } + else + { + int j; - values[i] = palloc(NCHARS * nxids); - values[i + 1] = palloc(NCHARS * nxids); - strcpy(values[i], "{"); - strcpy(values[i + 1], "{"); + values[Atnum_xids] = palloc(NCHARS * nmembers); + values[Atnum_modes] = palloc(NCHARS * nmembers); + values[Atnum_pids] = palloc(NCHARS * nmembers); - for (j = 0; j < nxids; j++) - { - char buf[NCHARS]; + strcpy(values[Atnum_xids], "{"); + strcpy(values[Atnum_modes], "{"); + strcpy(values[Atnum_pids], "{"); - if (TransactionIdIsInProgress(xids[j])) + for (j = 0; j < nmembers; j++) { - if (isValidXid) + char buf[NCHARS]; + + if (!first) { - strcat(values[i], ","); - strcat(values[i + 1], ","); + strcat(values[Atnum_xids], ","); + strcat(values[Atnum_modes], ","); + strcat(values[Atnum_pids], ","); } - snprintf(buf, NCHARS, "%d", xids[j]); - strcat(values[i], buf); - snprintf(buf, NCHARS, "%d", BackendXidGetPid(xids[j])); - strcat(values[i + 1], buf); + snprintf(buf, NCHARS, "%d", members[j].xid); + strcat(values[Atnum_xids], buf); + switch (members[j].status) + { + case MultiXactStatusUpdate: + snprintf(buf, NCHARS, "Update"); + break; + case MultiXactStatusNoKeyUpdate: + snprintf(buf, NCHARS, "No Key Update"); + break; + case MultiXactStatusForUpdate: + snprintf(buf, NCHARS, "For Update"); + break; + case MultiXactStatusForNoKeyUpdate: + snprintf(buf, NCHARS, "For No Key Update"); + break; + case MultiXactStatusForShare: + snprintf(buf, NCHARS, "Share"); + break; + case MultiXactStatusForKeyShare: + snprintf(buf, NCHARS, "Key Share"); + break; + } + strcat(values[Atnum_modes], buf); + snprintf(buf, NCHARS, "%d", + BackendXidGetPid(members[j].xid)); + strcat(values[Atnum_pids], buf); - isValidXid = 1; + first = false; } - } - strcat(values[i], "}"); - strcat(values[i + 1], "}"); - i++; + strcat(values[Atnum_xids], "}"); + strcat(values[Atnum_modes], "}"); + strcat(values[Atnum_pids], "}"); + } } else { - values[i++] = 
pstrdup("false"); - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data)); + values[Atnum_ismulti] = pstrdup("false"); + + values[Atnum_xids] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_xids], NCHARS, "{%d}", xmax); + + values[Atnum_modes] = palloc(NCHARS); + if (infomask & HEAP_XMAX_LOCK_ONLY) + { + if (HEAP_XMAX_IS_SHR_LOCKED(infomask)) + snprintf(values[Atnum_modes], NCHARS, "{For Share}"); + else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) + snprintf(values[Atnum_modes], NCHARS, "{For Key Share}"); + else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) + snprintf(values[Atnum_modes], NCHARS, "{For Update}"); + else + /* neither keyshare nor exclusive bit it set */ + snprintf(values[Atnum_modes], NCHARS, + "{transient upgrade status}"); + } + else + { + if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) + snprintf(values[Atnum_modes], NCHARS, "{Key Update}"); + else + snprintf(values[Atnum_modes], NCHARS, "{Update}"); + } - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data))); + values[Atnum_pids] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_pids], NCHARS, "{%d}", + BackendXidGetPid(xmax)); } LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); @@ -200,10 +274,10 @@ pgrowlocks(PG_FUNCTION_ARGS) /* make the tuple into a datum */ result = HeapTupleGetDatum(tuple); - /* Clean up */ - for (i = 0; i < mydata->ncolumns; i++) - pfree(values[i]); - pfree(values); + /* + * no need to pfree what we allocated; it's on a short-lived memory + * context anyway + */ SRF_RETURN_NEXT(funcctx, result); } diff --git a/contrib/pgrowlocks/pgrowlocks.control b/contrib/pgrowlocks/pgrowlocks.control index a6ba164515..dfa587d761 100644 --- a/contrib/pgrowlocks/pgrowlocks.control +++ b/contrib/pgrowlocks/pgrowlocks.control @@ -1,5 +1,5 @@ # pgrowlocks extension comment = 'show row-level locking information' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/pgrowlocks' relocatable = true diff --git a/doc/src/sgml/pgrowlocks.sgml b/doc/src/sgml/pgrowlocks.sgml index 390fa236d3..c7714d8877 100644 --- a/doc/src/sgml/pgrowlocks.sgml +++ b/doc/src/sgml/pgrowlocks.sgml @@ -43,12 +43,6 @@ pgrowlocks(text) returns setof record tid Tuple ID (TID) of locked row - - lock_type - text - Shared for shared lock, or - Exclusive for exclusive lock - locker xid @@ -64,6 +58,15 @@ pgrowlocks(text) returns setof record xid[] Transaction IDs of lockers (more than one if multitransaction) + + lock_type + text[] + Lock mode of lockers (more than one if multitransaction), + an array of Key Share, Share, + For No Key Update, No Key Update, + For Update, Update. + + pids integer[] diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 9963780c31..26d511fad8 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -45,7 +45,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressioncount | ALL } ] [ OFFSET start [ ROW | ROWS ] ] [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ONLY ] - [ FOR { UPDATE | SHARE } [ OF table_name [, ...] ] [ NOWAIT ] [...] ] + [ FOR { UPDATE | NO KEY UPDATE | SHARE | KEY SHARE } [ OF table_name [, ...] ] [ NOWAIT ] [...] ] where from_item can be one of: @@ -178,7 +178,8 @@ TABLE [ ONLY ] table_name [ * ] - If FOR UPDATE or FOR SHARE + If FOR UPDATE, FOR NO KEY UPDATE, FOR SHARE + or FOR KEY SHARE is specified, the SELECT statement locks the selected rows against concurrent updates. 
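To make the revised pgrowlocks() output concrete, here is a hedged sketch against a hypothetical table t with primary key id; the xid and TID values are purely illustrative:

    -- session 1
    BEGIN;
    SELECT * FROM t WHERE id = 1 FOR KEY SHARE;

    -- session 2
    SELECT locked_row, locker, multi, xids, modes FROM pgrowlocks('t');
    --  locked_row | locker | multi | xids  |       modes
    -- ------------+--------+-------+-------+-------------------
    --  (0,1)      |    740 | f     | {740} | {"For Key Share"}

With a single locker the mode is derived from the infomask bits, as in the pgrowlocks.c code above; with multiple lockers the function reports each MultiXact member and its status instead.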
(See table_name [ * ] You must have SELECT privilege on each column used - in a SELECT command. The use of FOR UPDATE - or FOR SHARE requires + in a SELECT command. The use of FOR NO KEY UPDATE, + FOR UPDATE, + FOR SHARE or FOR KEY SHARE requires UPDATE privilege as well (for at least one column of each table so selected). @@ -873,8 +875,8 @@ SELECT DISTINCT ON (location) location, time, report select_statement UNION [ ALL | DISTINCT ] select_statement select_statement is any SELECT statement without an ORDER - BY, LIMIT, FOR UPDATE, or - FOR SHARE clause. + BY, LIMIT, FOR NO KEY UPDATE, FOR UPDATE, + FOR SHARE, or FOR KEY SHARE clause. (ORDER BY and LIMIT can be attached to a subexpression if it is enclosed in parentheses. Without parentheses, these clauses will be taken to apply to the result of @@ -910,7 +912,8 @@ SELECT DISTINCT ON (location) location, time, report - Currently, FOR UPDATE and FOR SHARE cannot be + Currently, FOR NO KEY UPDATE, FOR UPDATE, FOR SHARE and + FOR KEY SHARE cannot be specified either for a UNION result or for any input of a UNION. @@ -925,8 +928,8 @@ SELECT DISTINCT ON (location) location, time, report select_statement INTERSECT [ ALL | DISTINCT ] select_statement select_statement is any SELECT statement without an ORDER - BY, LIMIT, FOR UPDATE, or - FOR SHARE clause. + BY, LIMIT, FOR NO KEY UPDATE, FOR UPDATE, + FOR SHARE, or FOR KEY SHARE clause. @@ -957,7 +960,8 @@ SELECT DISTINCT ON (location) location, time, report - Currently, FOR UPDATE and FOR SHARE cannot be + Currently, FOR NO KEY UPDATE, FOR UPDATE, FOR SHARE and + FOR KEY SHARE cannot be specified either for an INTERSECT result or for any input of an INTERSECT. @@ -972,8 +976,8 @@ SELECT DISTINCT ON (location) location, time, report select_statement EXCEPT [ ALL | DISTINCT ] select_statement select_statement is any SELECT statement without an ORDER - BY, LIMIT, FOR UPDATE, or - FOR SHARE clause. + BY, LIMIT, FOR NO KEY UPDATE, FOR UPDATE, + FOR SHARE, or FOR KEY SHARE clause. @@ -1000,7 +1004,8 @@ SELECT DISTINCT ON (location) location, time, report - Currently, FOR UPDATE and FOR SHARE cannot be + Currently, FOR NO KEY UPDATE, FOR UPDATE, FOR SHARE and + FOR KEY SHARE cannot be specified either for an EXCEPT result or for any input of an EXCEPT. @@ -1185,7 +1190,14 @@ FETCH { FIRST | NEXT } [ count ] { - <literal>FOR UPDATE</literal>/<literal>FOR SHARE</literal> Clause + <literal>FOR UPDATE</>, <literal>FOR NO KEY UPDATE</>/<literal>FOR SHARE</>/<literal>FOR KEY SHARE</> Clauses + + + FOR UPDATE, FOR NO KEY UPDATE, FOR SHARE + and FOR KEY SHARE + are locking clauses; they affect how SELECT + locks rows as they are obtained from the table. + The FOR UPDATE clause has this form: @@ -1194,6 +1206,13 @@ FOR UPDATE [ OF table_name [, ...] + + The FOR NO KEY UPDATE clause has this form: + +FOR NO KEY UPDATE [ OF table_name [, ...] ] [ NOWAIT ] + + + The closely related FOR SHARE clause has this form: @@ -1201,14 +1220,31 @@ FOR SHARE [ OF table_name [, ...] ] + + Similarly, the FOR KEY SHARE clause has this form: + +FOR KEY SHARE [ OF table_name [, ...] ] [ NOWAIT ] + + + FOR UPDATE causes the rows retrieved by the SELECT statement to be locked as though for update. This prevents them from being modified or deleted by other transactions until the current transaction ends. That is, other transactions that attempt UPDATE, - DELETE, or SELECT FOR UPDATE + DELETE, + SELECT FOR UPDATE, + SELECT FOR SHARE or + SELECT FOR KEY SHARE of these rows will be blocked until the current transaction ends. 
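A hedged sketch of how the new modes interact at the SQL level (hypothetical table t with primary key id; the error shown is the one raised when NOWAIT cannot obtain the row lock):

    -- session 1: weaker exclusive form; key-share lockers are not blocked
    BEGIN;
    SELECT * FROM t WHERE id = 1 FOR NO KEY UPDATE;

    -- session 2: compatible with session 1, returns immediately
    SELECT * FROM t WHERE id = 1 FOR KEY SHARE;

    -- session 2: FOR UPDATE conflicts with FOR NO KEY UPDATE;
    -- with NOWAIT the command errors out instead of waiting
    SELECT * FROM t WHERE id = 1 FOR UPDATE NOWAIT;
    -- ERROR:  could not obtain lock on row in relation "t"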
+ The FOR UPDATE lock mode + is also acquired by any DELETE on a row, and also by an + UPDATE that modifies the values on certain columns. Currently, + the set of columns considered for the UPDATE case are those that + have an unique index on them that can be used in a foreign key (so partial + indexes and expressional indexes are not considered), but this may change + in the future. Also, if an UPDATE, DELETE, or SELECT FOR UPDATE from another transaction has already locked a selected row or rows, SELECT FOR @@ -1220,13 +1256,33 @@ FOR SHARE [ OF table_name [, ...] ] linkend="mvcc">. + + FOR NO KEY UPDATE behaves similarly, except that the lock + acquired is weaker: this lock will not block + SELECT FOR KEY SHARE commands that attempt to acquire + a lock on the same rows. + + FOR SHARE behaves similarly, except that it acquires a shared rather than exclusive lock on each retrieved row. A shared lock blocks other transactions from performing UPDATE, DELETE, or SELECT FOR UPDATE on these rows, but it does not prevent them - from performing SELECT FOR SHARE. + from performing SELECT FOR SHARE or + SELECT FOR KEY SHARE. + + + + FOR KEY SHARE behaves similarly to FOR SHARE, + except that the lock + is weaker: SELECT FOR UPDATE is blocked, but + not SELECT FOR NO KEY UPDATE. A key-shared + lock blocks other transactions from performing DELETE + or any UPDATE that changes the key values, but not + other UPDATE, and neither it does prevent + SELECT FOR UPDATE, SELECT FOR SHARE, or + SELECT FOR KEY SHARE. @@ -1243,41 +1299,39 @@ FOR SHARE [ OF table_name [, ...] ] - If specific tables are named in FOR UPDATE - or FOR SHARE, + If specific tables are named in a locking clause, then only rows coming from those tables are locked; any other tables used in the SELECT are simply read as - usual. A FOR UPDATE or FOR SHARE + usual. A locking clause without a table list affects all tables used in the statement. - If FOR UPDATE or FOR SHARE is + If a locking clause is applied to a view or sub-query, it affects all tables used in the view or sub-query. - However, FOR UPDATE/FOR SHARE + However, these clauses do not apply to WITH queries referenced by the primary query. If you want row locking to occur within a WITH query, specify - FOR UPDATE or FOR SHARE within the - WITH query. + a locking clause within the WITH query. - Multiple FOR UPDATE and FOR SHARE + Multiple locking clauses can be written if it is necessary to specify different locking behavior for different tables. If the same table is mentioned (or - implicitly affected) by both FOR UPDATE and - FOR SHARE clauses, then it is processed as - FOR UPDATE. Similarly, a table is processed + implicitly affected) by more than one locking clause, + then it is processed as if it was only specified by the strongest one. + Similarly, a table is processed as NOWAIT if that is specified in any of the clauses affecting it. - FOR UPDATE and FOR SHARE cannot be + The locking clauses cannot be used in contexts where returned rows cannot be clearly identified with individual table rows; for example they cannot be used with aggregation. - When FOR UPDATE or FOR SHARE + When a locking clause appears at the top level of a SELECT query, the rows that are locked are exactly those that are returned by the query; in the case of a join query, the rows locked are those that contribute to @@ -1288,13 +1342,13 @@ FOR SHARE [ OF table_name [, ...] 
] LIMIT is used, locking stops once enough rows have been returned to satisfy the limit (but note that rows skipped over by OFFSET will get locked). Similarly, - if FOR UPDATE or FOR SHARE + if a locking clause is used in a cursor's query, only rows actually fetched or stepped past by the cursor will be locked. - When FOR UPDATE or FOR SHARE + When a locking clause appears in a sub-SELECT, the rows locked are those returned to the outer query by the sub-query. This might involve fewer rows than inspection of the sub-query alone would suggest, @@ -1307,11 +1361,9 @@ SELECT * FROM (SELECT * FROM mytable FOR UPDATE) ss WHERE col1 = 5; condition is not textually within the sub-query. - - - Avoid locking a row and then modifying it within a later savepoint or - PL/pgSQL exception block. A subsequent - rollback would cause the lock to be lost. For example: + + Previous releases failed to preserve a lock which is upgraded by a later + savepoint. For example, this code: BEGIN; SELECT * FROM mytable WHERE key = 1 FOR UPDATE; @@ -1319,23 +1371,15 @@ SAVEPOINT s; UPDATE mytable SET ... WHERE key = 1; ROLLBACK TO s; - After the ROLLBACK, the row is effectively unlocked, rather - than returned to its pre-savepoint state of being locked but not modified. - This hazard occurs if a row locked in the current transaction is updated - or deleted, or if a shared lock is upgraded to exclusive: in all these - cases, the former lock state is forgotten. If the transaction is then - rolled back to a state between the original locking command and the - subsequent change, the row will appear not to be locked at all. This is - an implementation deficiency which will be addressed in a future release - of PostgreSQL. - - + would fail to preserve the FOR UPDATE lock after the + ROLLBACK. This has been fixed in release 9.2. + It is possible for a SELECT command running at the READ COMMITTED transaction isolation level and using ORDER - BY and FOR UPDATE/SHARE to return rows out of + BY and a locking clause to return rows out of order. This is because ORDER BY is applied first. The command sorts the result, but might then block trying to obtain a lock on one or more of the rows. Once the SELECT unblocks, some @@ -1765,14 +1809,16 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; - <literal>FOR UPDATE</> and <literal>FOR SHARE</> + <literal>FOR NO KEY UPDATE</>, <literal>FOR UPDATE</>, <literal>FOR SHARE</>, <literal>FOR KEY SHARE</> Although FOR UPDATE appears in the SQL standard, the standard allows it only as an option of DECLARE CURSOR. PostgreSQL allows it in any SELECT query as well as in sub-SELECTs, but this is an extension. - The FOR SHARE variant, and the NOWAIT option, + The FOR NO KEY UPDATE, FOR SHARE and + FOR KEY SHARE variants, + as well as the NOWAIT option, do not appear in the standard. 
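The subtransaction fix mentioned in the commit message can be sketched as follows (hypothetical table t; the point is that rolling back a subtransaction that upgraded a lock now leaves the originally acquired, weaker lock in place rather than dropping it):

    BEGIN;
    SELECT * FROM t WHERE id = 1 FOR KEY SHARE;   -- weak lock in the main transaction
    SAVEPOINT s;
    SELECT * FROM t WHERE id = 1 FOR UPDATE;      -- subtransaction upgrades the lock
    ROLLBACK TO s;
    -- the row is again locked only FOR KEY SHARE: a concurrent
    -- FOR NO KEY UPDATE can proceed, but a DELETE still has to wait
    COMMIT;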
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 0706e3afc2..e39b9770cb 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -542,7 +542,7 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) result = TransactionIdGetDatum(HeapTupleHeaderGetXmin(tup->t_data)); break; case MaxTransactionIdAttributeNumber: - result = TransactionIdGetDatum(HeapTupleHeaderGetXmax(tup->t_data)); + result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data)); break; case MinCommandIdAttributeNumber: case MaxCommandIdAttributeNumber: diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock new file mode 100644 index 0000000000..8d5cc167c8 --- /dev/null +++ b/src/backend/access/heap/README.tuplock @@ -0,0 +1,139 @@ +Locking tuples +-------------- + +Locking tuples is not as easy as locking tables or other database objects. +The problem is that transactions might want to lock large numbers of tuples at +any one time, so it's not possible to keep the locks objects in shared memory. +To work around this limitation, we use a two-level mechanism. The first level +is implemented by storing locking information in the tuple header: a tuple is +marked as locked by setting the current transaction's XID as its XMAX, and +setting additional infomask bits to distinguish this case from the more normal +case of having deleted the tuple. When multiple transactions concurrently +lock a tuple, a MultiXact is used; see below. This mechanism can accomodate +arbitrarily large numbers of tuples being locked simultaneously. + +When it is necessary to wait for a tuple-level lock to be released, the basic +delay is provided by XactLockTableWait or MultiXactIdWait on the contents of +the tuple's XMAX. However, that mechanism will release all waiters +concurrently, so there would be a race condition as to which waiter gets the +tuple, potentially leading to indefinite starvation of some waiters. The +possibility of share-locking makes the problem much worse --- a steady stream +of share-lockers can easily block an exclusive locker forever. To provide +more reliable semantics about who gets a tuple-level lock first, we use the +standard lock manager, which implements the second level mentioned above. The +protocol for waiting for a tuple-level lock is really + + LockTuple() + XactLockTableWait() + mark tuple as locked by me + UnlockTuple() + +When there are multiple waiters, arbitration of who is to get the lock next +is provided by LockTuple(). However, at most one tuple-level lock will +be held or awaited per backend at any time, so we don't risk overflow +of the lock table. Note that incoming share-lockers are required to +do LockTuple as well, if there is any conflict, to ensure that they don't +starve out waiting exclusive-lockers. However, if there is not any active +conflict for a tuple, we don't incur any extra overhead. + +We provide four levels of tuple locking strength: SELECT FOR KEY UPDATE is +super-exclusive locking (used to delete tuples and more generally to update +tuples modifying the values of the columns that make up the key of the tuple); +SELECT FOR UPDATE is a standards-compliant exclusive lock; SELECT FOR SHARE +implements shared locks; and finally SELECT FOR KEY SHARE is a super-weak mode +that does not conflict with exclusive mode, but conflicts with SELECT FOR KEY +UPDATE. 
This last mode implements a mode just strong enough to implement RI +checks, i.e. it ensures that tuples do not go away from under a check, without +blocking when some other transaction that want to update the tuple without +changing its key. + +The conflict table is: + + KEY UPDATE UPDATE SHARE KEY SHARE +KEY UPDATE conflict conflict conflict conflict +UPDATE conflict conflict conflict +SHARE conflict conflict +KEY SHARE conflict + +When there is a single locker in a tuple, we can just store the locking info +in the tuple itself. We do this by storing the locker's Xid in XMAX, and +setting infomask bits specifying the locking strength. There is one exception +here: since infomask space is limited, we do not provide a separate bit +for SELECT FOR SHARE, so we have to use the extended info in a MultiXact in +that case. (The other cases, SELECT FOR UPDATE and SELECT FOR KEY SHARE, are +presumably more commonly used due to being the standards-mandated locking +mechanism, or heavily used by the RI code, so we want to provide fast paths +for those.) + +MultiXacts +---------- + +A tuple header provides very limited space for storing information about tuple +locking and updates: there is room only for a single Xid and a small number of +infomask bits. Whenever we need to store more than one lock, we replace the +first locker's Xid with a new MultiXactId. Each MultiXact provides extended +locking data; it comprises an array of Xids plus some flags bits for each one. +The flags are currently used to store the locking strength of each member +transaction. (The flags also distinguish a pure locker from an updater.) + +In earlier PostgreSQL releases, a MultiXact always meant that the tuple was +locked in shared mode by multiple transactions. This is no longer the case; a +MultiXact may contain an update or delete Xid. (Keep in mind that tuple locks +in a transaction do not conflict with other tuple locks in the same +transaction, so it's possible to have otherwise conflicting locks in a +MultiXact if they belong to the same transaction). + +Note that each lock is attributed to the subtransaction that acquires it. +This means that a subtransaction that aborts is seen as though it releases the +locks it acquired; concurrent transactions can then proceed without having to +wait for the main transaction to finish. It also means that a subtransaction +can upgrade to a stronger lock level than an earlier transaction had, and if +the subxact aborts, the earlier, weaker lock is kept. + +The possibility of having an update within a MultiXact means that they must +persist across crashes and restarts: a future reader of the tuple needs to +figure out whether the update committed or aborted. So we have a requirement +that pg_multixact needs to retain pages of its data until we're certain that +the MultiXacts in them are no longer of interest. + +VACUUM is in charge of removing old MultiXacts at the time of tuple freezing. +This works in the same way that pg_clog segments are removed: we have a +pg_class column that stores the earliest multixact that could possibly be +stored in the table; the minimum of all such values is stored in a pg_database +column. VACUUM computes the minimum across all pg_database values, and +removes pg_multixact segments older than the minimum. + +Infomask Bits +------------- + +The following infomask bits are applicable: + +- HEAP_XMAX_INVALID + Any tuple with this bit set does not have a valid value stored in XMAX. 
+ +- HEAP_XMAX_IS_MULTI + This bit is set if the tuple's Xmax is a MultiXactId (as opposed to a + regular TransactionId). + +- HEAP_XMAX_LOCK_ONLY + This bit is set when the XMAX is a locker only; that is, if it's a + multixact, it does not contain an update among its members. It's set when + the XMAX is a plain Xid that locked the tuple, as well. + +- HEAP_XMAX_KEYSHR_LOCK +- HEAP_XMAX_EXCL_LOCK + These bits indicate the strength of the lock acquired; they are useful when + the XMAX is not a MultiXactId. If it's a multi, the info is to be found in + the member flags. If HEAP_XMAX_IS_MULTI is not set and HEAP_XMAX_LOCK_ONLY + is set, then one of these *must* be set as well. + Note there is no infomask bit for a SELECT FOR SHARE lock. Also there is no + separate bit for a SELECT FOR KEY UPDATE lock; this is implemented by the + HEAP_KEYS_UPDATED bit. + +- HEAP_KEYS_UPDATED + This bit lives in t_infomask2. If set, indicates that the XMAX updated + this tuple and changed the key values, or it deleted the tuple. + It's set regardless of whether the XMAX is a TransactionId or a MultiXactId. + +We currently never set the HEAP_XMAX_COMMITTED when the HEAP_XMAX_IS_MULTI bit +is set. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b19d1cf6c5..57d47e8601 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -84,12 +84,105 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, - ItemPointerData from, Buffer newbuf, HeapTuple newtup, - bool all_visible_cleared, bool new_all_visible_cleared); -static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, - HeapTuple oldtup, HeapTuple newtup); + Buffer newbuf, HeapTuple oldtup, + HeapTuple newtup, bool all_visible_cleared, + bool new_all_visible_cleared); +static void HeapSatisfiesHOTandKeyUpdate(Relation relation, + Bitmapset *hot_attrs, Bitmapset *key_attrs, + bool *satisfies_hot, bool *satisfies_key, + HeapTuple oldtup, HeapTuple newtup); +static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2); +static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, + ItemPointer ctid, TransactionId xid, + LockTupleMode mode); +static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2); +static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, + uint16 t_infomask); +static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining, uint16 infomask); +static bool ConditionalMultiXactIdWait(MultiXactId multi, + MultiXactStatus status, int *remaining, + uint16 infomask); +/* + * Each tuple lock mode has a corresponding heavyweight lock, and one or two + * corresponding MultiXactStatuses (one to merely lock tuples, another one to + * update them). This table (and the macros below) helps us determine the + * heavyweight lock mode and MultiXactStatus values to use for any particular + * tuple lock strength. 
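The heavyweight lock acquired through this machinery is observable from SQL while a backend waits its turn on a row. For illustration (hypothetical table t; one session holds FOR UPDATE on a row and a second session's non-key UPDATE is waiting, so the waiter holds a tuple-level ExclusiveLock per the tupleLockExtraInfo entries below; values are illustrative):

    SELECT locktype, relation::regclass AS rel, page, tuple, mode, granted
      FROM pg_locks
     WHERE locktype = 'tuple';
    --  locktype | rel | page | tuple |     mode      | granted
    -- ----------+-----+------+-------+---------------+---------
    --  tuple    | t   |    0 |     1 | ExclusiveLock | t

The entry belongs to the waiting backend: per the protocol in README.tuplock it takes the tuple lock first and then sleeps on the locker's transaction, so later arrivals queue behind it.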
+ */ +static const struct +{ + LOCKMODE hwlock; + MultiXactStatus lockstatus; + MultiXactStatus updstatus; +} +tupleLockExtraInfo[MaxLockTupleMode + 1] = +{ + { /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + -1 /* KeyShare does not allow updating tuples */ + }, + { /* LockTupleShare */ + RowShareLock, + MultiXactStatusForShare, + -1 /* Share does not allow updating tuples */ + }, + { /* LockTupleNoKeyExclusive */ + ExclusiveLock, + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { /* LockTupleExclusive */ + AccessExclusiveLock, + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; +/* Get the LOCKMODE for a given MultiXactStatus */ +#define LOCKMODE_from_mxstatus(status) \ + (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. + * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. + */ +#define LockTupleTuplock(rel, tup, mode) \ + LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define UnlockTupleTuplock(rel, tup, mode) \ + UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(rel, tup, mode) \ + ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) + +/* + * This table maps tuple lock strength values for each particular + * MultiXactStatus value. + */ +static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = +{ + LockTupleKeyShare, /* ForKeyShare */ + LockTupleShare, /* ForShare */ + LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ + LockTupleExclusive, /* ForUpdate */ + LockTupleNoKeyExclusive, /* NoKeyUpdate */ + LockTupleExclusive /* Update */ +}; + +/* Get the LockTupleMode for a given MultiXactStatus */ +#define TUPLOCK_from_mxstatus(status) \ + (MultiXactStatusLock[(status)]) +/* Get the is_update bit for a given MultiXactStatus */ +#define ISUPDATE_from_mxstatus(status) \ + ((status) > MultiXactStatusForUpdate) + /* ---------------------------------------------------------------- * heap support routines * ---------------------------------------------------------------- @@ -1664,7 +1757,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ItemPointerGetBlockNumber(tid)); offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); at_chain_start = false; - prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); } else break; /* end of chain */ @@ -1787,7 +1880,7 @@ heap_get_latest_tid(Relation relation, * tuple. Check for XMIN match. */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) { UnlockReleaseBuffer(buffer); break; @@ -1805,7 +1898,8 @@ heap_get_latest_tid(Relation relation, /* * If there's a valid t_ctid link, follow it, else we're done. 
*/ - if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) || + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(tp.t_data) || ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) { UnlockReleaseBuffer(buffer); @@ -1813,7 +1907,7 @@ heap_get_latest_tid(Relation relation, } ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); UnlockReleaseBuffer(buffer); } /* end of loop */ } @@ -1826,17 +1920,25 @@ heap_get_latest_tid(Relation relation, * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will * be set on exit. If the transaction committed, we set the XMAX_COMMITTED * hint bit if possible --- but beware that that may not yet be possible, - * if the transaction committed asynchronously. Hence callers should look - * only at XMAX_INVALID. + * if the transaction committed asynchronously. + * + * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID + * even if it commits. + * + * Hence callers should look only at XMAX_INVALID. + * + * Note this is not allowed for tuples whose xmax is a multixact. */ static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) { - Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), xid)); + Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); + Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) { - if (TransactionIdDidCommit(xid)) + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) && + TransactionIdDidCommit(xid)) HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xid); else @@ -2373,6 +2475,26 @@ simple_heap_insert(Relation relation, HeapTuple tup) return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); } +/* + * Given infomask/infomask2, compute the bits that must be saved in the + * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock, + * xl_heap_lock_updated WAL records. + * + * See fix_infomask_from_infobits. + */ +static uint8 +compute_infobits(uint16 infomask, uint16 infomask2) +{ + return + ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | + ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | + ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | + /* note we ignore HEAP_XMAX_SHR_LOCK here */ + ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | + ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? + XLHL_KEYS_UPDATED : 0); +} + /* * heap_delete - delete a tuple * @@ -2393,7 +2515,8 @@ simple_heap_insert(Relation relation, HeapTuple tup) * (the last only possible if wait == false). * * In the failure cases, the routine fills *hufd with the tuple's t_ctid, - * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax + * (the last only for HeapTupleSelfUpdated, since we * cannot obtain cmax from a combocid generated by another transaction). * See comments for struct HeapUpdateFailureData for additional info. 
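At the SQL level, the LockTupleExclusive / MultiXactStatusUpdate strength used by heap_delete means a DELETE conflicts with every tuple lock mode, including the new weak ones. A hedged sketch (hypothetical table t):

    -- session 1: weakest lock, the kind an RI check takes
    BEGIN;
    SELECT * FROM t WHERE id = 1 FOR KEY SHARE;

    -- session 2: DELETE needs the strongest tuple lock, so it waits
    -- until session 1 commits or rolls back
    DELETE FROM t WHERE id = 1;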
*/ @@ -2410,6 +2533,9 @@ heap_delete(Relation relation, ItemPointer tid, BlockNumber block; Buffer buffer; Buffer vmbuffer = InvalidBuffer; + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; bool have_tuple_lock = false; bool iscombo; bool all_visible_cleared = false; @@ -2465,7 +2591,7 @@ l1: uint16 infomask; /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetXmax(tp.t_data); + xwait = HeapTupleHeaderGetRawXmax(tp.t_data); infomask = tp.t_data->t_infomask; LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -2481,20 +2607,20 @@ l1: */ if (!have_tuple_lock) { - LockTuple(relation, &(tp.t_self), ExclusiveLock); + LockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); have_tuple_lock = true; } /* * Sleep until concurrent transaction ends. Note that we don't care - * if the locker has an exclusive or shared lock, because we need - * exclusive. + * which lock mode the locker has, because we need the strongest one. */ if (infomask & HEAP_XMAX_IS_MULTI) { /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait); + MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, + NULL, infomask); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* @@ -2503,7 +2629,7 @@ l1: * change, and start over if so. */ if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data), + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), xwait)) goto l1; @@ -2529,7 +2655,7 @@ l1: * Check for xmax change, and start over if so. */ if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data), + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), xwait)) goto l1; @@ -2541,8 +2667,9 @@ l1: * We may overwrite if previous xmax aborted, or if it committed but * only locked the tuple without updating it. */ - if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tp.t_data)) result = HeapTupleMayBeUpdated; else result = HeapTupleUpdated; @@ -2562,14 +2689,14 @@ l1: result == HeapTupleBeingUpdated); Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); hufd->ctid = tp.t_data->t_ctid; - hufd->xmax = HeapTupleHeaderGetXmax(tp.t_data); + hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); if (result == HeapTupleSelfUpdated) hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data); else hufd->cmax = 0; /* for lack of an InvalidCommandId value */ UnlockReleaseBuffer(buffer); if (have_tuple_lock) - UnlockTuple(relation, &(tp.t_self), ExclusiveLock); + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); return result; @@ -2603,14 +2730,29 @@ l1: vmbuffer); } + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.) 
+ */ + MultiXactIdSetOldestMember(); + + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), + tp.t_data->t_infomask, tp.t_data->t_infomask2, + xid, LockTupleExclusive, true, + &new_xmax, &new_infomask, &new_infomask2); + /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tp.t_data->t_infomask |= new_infomask; + tp.t_data->t_infomask2 |= new_infomask2; HeapTupleHeaderClearHotUpdated(tp.t_data); - HeapTupleHeaderSetXmax(tp.t_data, xid); + HeapTupleHeaderSetXmax(tp.t_data, new_xmax); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); /* Make sure there is no forward chain link in t_ctid */ tp.t_data->t_ctid = tp.t_self; @@ -2625,8 +2767,11 @@ l1: XLogRecData rdata[2]; xlrec.all_visible_cleared = all_visible_cleared; + xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, + tp.t_data->t_infomask2); xlrec.target.node = relation->rd_node; xlrec.target.tid = tp.t_self; + xlrec.xmax = new_xmax; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapDelete; rdata[0].buffer = InvalidBuffer; @@ -2679,7 +2824,7 @@ l1: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) - UnlockTuple(relation, &(tp.t_self), ExclusiveLock); + UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); pgstat_count_heap_delete(relation); @@ -2739,6 +2884,7 @@ simple_heap_delete(Relation relation, ItemPointer tid) * crosscheck - if not InvalidSnapshot, also check old tuple against this * wait - true if should wait for any conflicting update to commit/abort * hufd - output parameter, filled in failure cases (see below) + * lockmode - output parameter, filled with lock mode acquired on tuple * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we *did* update it. Failure return codes are @@ -2752,23 +2898,26 @@ simple_heap_delete(Relation relation, ItemPointer tid) * data are not reflected into *newtup. * * In the failure cases, the routine fills *hufd with the tuple's t_ctid, - * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax + * (the last only for HeapTupleSelfUpdated, since we * cannot obtain cmax from a combocid generated by another transaction). * See comments for struct HeapUpdateFailureData for additional info. 
*/ HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd) + HeapUpdateFailureData *hufd, LockTupleMode *lockmode) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; + Bitmapset *key_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; Page page; BlockNumber block; + MultiXactStatus mxact_status; Buffer buffer, newbuf, vmbuffer = InvalidBuffer, @@ -2779,9 +2928,20 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, pagefree; bool have_tuple_lock = false; bool iscombo; + bool satisfies_hot; + bool satisfies_key; bool use_hot_update = false; + bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; + bool checked_lockers; + bool locker_remains; + TransactionId xmax_new_tuple, + xmax_old_tuple; + uint16 infomask_old_tuple, + infomask2_old_tuple, + infomask_new_tuple, + infomask2_new_tuple; Assert(ItemPointerIsValid(otid)); @@ -2797,7 +2957,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * Note that we get a copy here, so we need not worry about relcache flush * happening midway through. */ - hot_attrs = RelationGetIndexAttrBitmap(relation); + hot_attrs = RelationGetIndexAttrBitmap(relation, false); + key_attrs = RelationGetIndexAttrBitmap(relation, true); block = ItemPointerGetBlockNumber(otid); buffer = ReadBuffer(relation, block); @@ -2821,6 +2982,44 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_len = ItemIdGetLength(lp); oldtup.t_self = *otid; + /* + * If we're not updating any "key" column, we can grab a weaker lock type. + * This allows for more concurrency when we are running simultaneously with + * foreign key checks. + * + * Note that if a column gets detoasted while executing the update, but the + * value ends up being the same, this test will fail and we will use the + * stronger lock. This is acceptable; the important case to optimize is + * updates that don't manipulate key columns, not those that + * serendipitiously arrive at the same key values. + */ + HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, + &satisfies_hot, &satisfies_key, + &oldtup, newtup); + if (satisfies_key) + { + *lockmode = LockTupleNoKeyExclusive; + mxact_status = MultiXactStatusNoKeyUpdate; + key_intact = true; + + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.) + */ + MultiXactIdSetOldestMember(); + } + else + { + *lockmode = LockTupleExclusive; + mxact_status = MultiXactStatusUpdate; + key_intact = false; + } + /* * Note: beyond this point, use oldtup not otid to refer to old tuple. 
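The branch above is where heap_update picks the tuple lock strength based on whether any key column is modified. A minimal standalone model of that choice follows; the enums are stand-ins for LockTupleMode and MultiXactStatus, and only the two value pairs used by this branch are shown.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for the two LockTupleMode / MultiXactStatus pairs chosen here. */
    typedef enum { MODE_NO_KEY_EXCLUSIVE, MODE_EXCLUSIVE } lock_mode;
    typedef enum { STATUS_NO_KEY_UPDATE, STATUS_UPDATE } mxact_status;

    /*
     * If the new tuple leaves every key column unchanged, the update only needs
     * the weaker "no key update" strength, so FOR KEY SHARE lockers (for
     * example, concurrent foreign-key checks) are not blocked by it.
     */
    static void
    choose_update_strength(bool key_columns_unchanged,
                           lock_mode *mode, mxact_status *status)
    {
        if (key_columns_unchanged)
        {
            *mode = MODE_NO_KEY_EXCLUSIVE;
            *status = STATUS_NO_KEY_UPDATE;
        }
        else
        {
            *mode = MODE_EXCLUSIVE;
            *status = STATUS_UPDATE;
        }
    }

    int
    main(void)
    {
        lock_mode    mode;
        mxact_status status;

        choose_update_strength(true, &mode, &status);
        printf("key untouched -> %s\n",
               mode == MODE_NO_KEY_EXCLUSIVE ? "no-key-exclusive" : "exclusive");
        return 0;
    }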
* otid may very well point at newtup->t_self, which we will overwrite @@ -2829,8 +3028,13 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, */ l2: + checked_lockers = false; + locker_remains = false; result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer); + /* see below about the "no wait" case */ + Assert(result != HeapTupleBeingUpdated || wait); + if (result == HeapTupleInvisible) { UnlockReleaseBuffer(buffer); @@ -2838,11 +3042,26 @@ l2: } else if (result == HeapTupleBeingUpdated && wait) { - TransactionId xwait; + TransactionId xwait; uint16 infomask; + bool can_continue = false; + + checked_lockers = true; + + /* + * XXX note that we don't consider the "no wait" case here. This + * isn't a problem currently because no caller uses that case, but it + * should be fixed if such a caller is introduced. It wasn't a problem + * previously because this code would always wait, but now that some + * tuple locks do not conflict with one of the lock modes we use, it is + * possible that this case is interesting to handle specially. + * + * This may cause failures with third-party code that calls heap_update + * directly. + */ /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetXmax(oldtup.t_data); + xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); infomask = oldtup.t_data->t_infomask; LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -2858,20 +3077,29 @@ l2: */ if (!have_tuple_lock) { - LockTuple(relation, &(oldtup.t_self), ExclusiveLock); + LockTupleTuplock(relation, &(oldtup.t_self), *lockmode); have_tuple_lock = true; } /* - * Sleep until concurrent transaction ends. Note that we don't care - * if the locker has an exclusive or shared lock, because we need - * exclusive. + * Now we have to do something about the existing locker. If it's a + * multi, sleep on it; we might be awakened before it is completely + * gone (or even not sleep at all in some cases); we need to preserve + * it as locker, unless it is gone completely. + * + * If it's not a multi, we need to check for sleeping conditions before + * actually going to sleep. If the update doesn't conflict with the + * locks, we just continue without sleeping (but making sure it is + * preserved). */ - if (infomask & HEAP_XMAX_IS_MULTI) { + TransactionId update_xact; + int remain; + /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait); + MultiXactIdWait((MultiXactId) xwait, mxact_status, &remain, + infomask); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* @@ -2880,49 +3108,87 @@ l2: * change, and start over if so. */ if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data), + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), xwait)) goto l2; /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to update - * the tuple in either case, however (the latter case is - * essentially a situation of upgrading our former shared lock to - * exclusive). We don't bother changing the on-disk hint bits - * since we are about to overwrite the xmax altogether. + * Note that the multixact may not be done by now. It could have + * surviving members; our own xact or other subxacts of this + * backend, and also any other concurrent transaction that locked + * the tuple with KeyShare if we only got TupleLockUpdate. 
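The locker handling that continues below boils down to a small decision: whether heap_update must sleep, and whether it may proceed once the buffer is re-locked. The following sketch condenses that control flow into one function over plain flags; the booleans stand in for the infomask tests and the post-wait multixact lookups of the real code, and the rechecks done after re-locking the buffer are omitted.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct
    {
        bool must_sleep;        /* wait for the existing locker(s) first? */
        bool can_continue;      /* after that, may the update proceed?    */
    } locker_decision;

    static locker_decision
    decide(bool xmax_is_multi,
           bool surviving_update_committed, /* an update in the multi committed */
           bool plain_locker_is_keyshare,   /* single locker, key-share only    */
           bool key_intact,                 /* our update leaves key columns alone */
           bool plain_xmax_invalid_after)   /* plain locker aborted / locked only */
    {
        locker_decision d;

        if (xmax_is_multi)
        {
            /* wait out conflicting members, then proceed unless an update stuck */
            d.must_sleep = true;
            d.can_continue = !surviving_update_committed;
        }
        else if (plain_locker_is_keyshare && key_intact)
        {
            /* a key-share locker does not conflict with a no-key update */
            d.must_sleep = false;
            d.can_continue = true;
        }
        else
        {
            d.must_sleep = true;
            d.can_continue = plain_xmax_invalid_after;
        }
        return d;
    }

    int
    main(void)
    {
        locker_decision d = decide(false, false, true, true, false);

        printf("key-share locker, key intact: sleep=%d continue=%d\n",
               d.must_sleep, d.can_continue);
        return 0;
    }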
If this + * is the case, we have to be careful to mark the updated tuple + * with the surviving members in Xmax. + * + * Note that there could have been another update in the MultiXact. + * In that case, we need to check whether it committed or aborted. + * If it aborted we are safe to update it again; otherwise there is + * an update conflict, and we have to return HeapTupleUpdated + * below. + * + * In the LockTupleExclusive case, we still need to preserve the + * surviving members: those would include the tuple locks we had + * before this one, which are important to keep in case this + * subxact aborts. */ + update_xact = InvalidTransactionId; + if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask)) + update_xact = HeapTupleGetUpdateXid(oldtup.t_data); + + /* there was no UPDATE in the MultiXact; or it aborted. */ + if (!TransactionIdIsValid(update_xact) || + TransactionIdDidAbort(update_xact)) + can_continue = true; + + locker_remains = remain != 0; } else { - /* wait for regular transaction to end */ - XactLockTableWait(xwait); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. + * If it's just a key-share locker, and we're not changing the + * key columns, we don't need to wait for it to end; but we + * need to preserve it as locker. */ - if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data), - xwait)) - goto l2; + if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact) + { + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + /* + * recheck the locker; if someone else changed the tuple while we + * weren't looking, start over. + */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), + xwait)) + goto l2; + + can_continue = true; + locker_remains = true; + } + else + { + /* wait for regular transaction to end */ + XactLockTableWait(xwait); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), + xwait)) + goto l2; + + /* Otherwise check if it committed or aborted */ + UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) + can_continue = true; + } } - /* - * We may overwrite if previous xmax aborted, or if it committed but - * only locked the tuple without updating it. - */ - if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) - result = HeapTupleMayBeUpdated; - else - result = HeapTupleUpdated; + result = can_continue ? 
HeapTupleMayBeUpdated : HeapTupleUpdated; } if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated) @@ -2939,17 +3205,18 @@ l2: result == HeapTupleBeingUpdated); Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); hufd->ctid = oldtup.t_data->t_ctid; - hufd->xmax = HeapTupleHeaderGetXmax(oldtup.t_data); + hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); if (result == HeapTupleSelfUpdated) hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); else hufd->cmax = 0; /* for lack of an InvalidCommandId value */ UnlockReleaseBuffer(buffer); if (have_tuple_lock) - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); bms_free(hot_attrs); + bms_free(key_attrs); return result; } @@ -2958,7 +3225,7 @@ l2: * visible while we were busy locking the buffer, or during some * subsequent window during which we had it unlocked, we'll have to unlock * and re-lock, to avoid holding the buffer lock across an I/O. That's a - * bit unfortunate, esepecially since we'll now have to recheck whether + * bit unfortunate, especially since we'll now have to recheck whether * the tuple has been locked or updated under us, but hopefully it won't * happen very often. */ @@ -2991,12 +3258,54 @@ l2: Assert(!(newtup->t_data->t_infomask & HEAP_HASOID)); } + /* + * If the tuple we're updating is locked, we need to preserve the locking + * info in the old tuple's Xmax. Prepare a new Xmax value for this. + */ + compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + oldtup.t_data->t_infomask, + oldtup.t_data->t_infomask2, + xid, *lockmode, true, + &xmax_old_tuple, &infomask_old_tuple, + &infomask2_old_tuple); + + /* And also prepare an Xmax value for the new copy of the tuple */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || + (checked_lockers && !locker_remains)) + xmax_new_tuple = InvalidTransactionId; + else + xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); + + if (!TransactionIdIsValid(xmax_new_tuple)) + { + infomask_new_tuple = HEAP_XMAX_INVALID; + infomask2_new_tuple = 0; + } + else + { + if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) + { + GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, + &infomask2_new_tuple); + } + else + { + infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY; + infomask2_new_tuple = 0; + } + } + + /* + * Prepare the new tuple with the appropriate initial values of Xmin and + * Xmax, as well as initial infomask bits as computed above. + */ newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); - newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED); HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); - HeapTupleHeaderSetXmax(newtup->t_data, 0); /* for cleanliness */ + newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; + newtup->t_data->t_infomask2 |= infomask2_new_tuple; + HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); newtup->t_tableOid = RelationGetRelid(relation); /* @@ -3035,14 +3344,14 @@ l2: if (need_toast || newtupsize > pagefree) { /* Clear obsolete visibility flags ... */ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleClearHotUpdated(&oldtup); /* ... 
and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(oldtup.t_data, xid); + Assert(TransactionIdIsValid(xmax_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); + oldtup.t_data->t_infomask |= infomask_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_old_tuple; HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated */ oldtup.t_data->t_ctid = oldtup.t_self; @@ -3145,7 +3454,7 @@ l2: * to do a HOT update. Check if any of the index columns have been * changed. If not, then HOT update is possible. */ - if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup)) + if (satisfies_hot) use_hot_update = true; } else @@ -3193,13 +3502,13 @@ l2: if (!already_marked) { /* Clear obsolete visibility flags ... */ - oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* ... and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(oldtup.t_data, xid); + Assert(TransactionIdIsValid(xmax_old_tuple)); + HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); + oldtup.t_data->t_infomask |= infomask_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_old_tuple; HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); } @@ -3229,8 +3538,8 @@ l2: /* XLOG stuff */ if (RelationNeedsWAL(relation)) { - XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self, - newbuf, heaptup, + XLogRecPtr recptr = log_heap_update(relation, buffer, + newbuf, &oldtup, heaptup, all_visible_cleared, all_visible_cleared_new); @@ -3272,7 +3581,7 @@ l2: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); pgstat_count_heap_update(relation, use_hot_update); @@ -3287,13 +3596,14 @@ l2: } bms_free(hot_attrs); + bms_free(key_attrs); return HeapTupleMayBeUpdated; } /* * Check if the specified attribute's value is same in both given tuples. - * Subroutine for HeapSatisfiesHOTUpdate. + * Subroutine for HeapSatisfiesHOTandKeyUpdate. */ static bool heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, @@ -3327,7 +3637,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, /* * Extract the corresponding values. XXX this is pretty inefficient if - * there are many indexed columns. Should HeapSatisfiesHOTUpdate do a + * there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do a * single heap_deform_tuple call on each tuple, instead? But that doesn't * work for system columns ... */ @@ -3370,35 +3680,101 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, } /* - * Check if the old and new tuples represent a HOT-safe update. To be able - * to do a HOT update, we must not have changed any columns used in index - * definitions. + * Check which columns are being updated. + * + * This simultaneously checks conditions for HOT updates and for FOR KEY + * SHARE updates. Since much of the time they will be checking very similar + * sets of columns, and doing the same tests on them, it makes sense to + * optimize and do them together. * - * The set of attributes to be checked is passed in (we dare not try to - * compute it while holding exclusive buffer lock...) NOTE that hot_attrs - * is destructively modified! That is OK since this is invoked at most once - * by heap_update(). 
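One consequence of preserving lockers, visible in the xmax_new_tuple / infomask_new_tuple computation a little earlier in this hunk, is that the replacement version of an updated row can start life with a non-invalid xmax: surviving lockers are carried over so they keep the new version locked as well. The sketch below models only that carry-over decision; the flag names are simplified stand-ins for the checks made in the patch.

    #include <stdbool.h>
    #include <stdio.h>

    #define INVALID_XID 0u

    /*
     * Decide what xmax the new version of an updated row starts with.  If the
     * old version had lockers that we did not wait out, they are carried over
     * so they also hold the new version locked; otherwise the new version
     * starts with an invalid xmax, as it always did before this patch.
     */
    static unsigned int
    new_version_xmax(unsigned int old_xmax,
                     bool old_xmax_invalid,
                     bool checked_lockers,
                     bool locker_remains)
    {
        if (old_xmax_invalid || (checked_lockers && !locker_remains))
            return INVALID_XID;
        return old_xmax;        /* keep the surviving locker(s) */
    }

    int
    main(void)
    {
        printf("surviving key-share locker carried over: xmax=%u\n",
               new_version_xmax(1234, false, true, true));
        printf("no lockers left: xmax=%u\n",
               new_version_xmax(1234, false, true, false));
        return 0;
    }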
+ * We receive two bitmapsets comprising the two sets of columns we're + * interested in. Note these are destructively modified; that is OK since + * this is invoked at most once in heap_update. * - * Returns true if safe to do HOT update. + * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not + * modified indexed columns); key_result is set to TRUE if the update does not + * modify columns used in the key. */ -static bool -HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, - HeapTuple oldtup, HeapTuple newtup) +static void +HeapSatisfiesHOTandKeyUpdate(Relation relation, + Bitmapset *hot_attrs, Bitmapset *key_attrs, + bool *satisfies_hot, bool *satisfies_key, + HeapTuple oldtup, HeapTuple newtup) { - int attrnum; + int next_hot_attnum; + int next_key_attnum; + bool hot_result = true; + bool key_result = true; + bool key_done = false; + bool hot_done = false; + + next_hot_attnum = bms_first_member(hot_attrs); + if (next_hot_attnum == -1) + hot_done = true; + else + /* Adjust for system attributes */ + next_hot_attnum += FirstLowInvalidHeapAttributeNumber; - while ((attrnum = bms_first_member(hot_attrs)) >= 0) - { + next_key_attnum = bms_first_member(key_attrs); + if (next_key_attnum == -1) + key_done = true; + else /* Adjust for system attributes */ - attrnum += FirstLowInvalidHeapAttributeNumber; + next_key_attnum += FirstLowInvalidHeapAttributeNumber; - /* If the attribute value has changed, we can't do HOT update */ - if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum, - oldtup, newtup)) - return false; + for (;;) + { + int check_now; + bool changed; + + /* both bitmapsets are now empty */ + if (key_done && hot_done) + break; + + /* XXX there's probably an easier way ... */ + if (hot_done) + check_now = next_key_attnum; + if (key_done) + check_now = next_hot_attnum; + else + check_now = Min(next_hot_attnum, next_key_attnum); + + changed = !heap_tuple_attr_equals(RelationGetDescr(relation), + check_now, oldtup, newtup); + if (changed) + { + if (check_now == next_hot_attnum) + hot_result = false; + if (check_now == next_key_attnum) + key_result = false; + } + + /* if both are false now, we can stop checking */ + if (!hot_result && !key_result) + break; + + if (check_now == next_hot_attnum) + { + next_hot_attnum = bms_first_member(hot_attrs); + if (next_hot_attnum == -1) + hot_done = true; + else + /* Adjust for system attributes */ + next_hot_attnum += FirstLowInvalidHeapAttributeNumber; + } + if (check_now == next_key_attnum) + { + next_key_attnum = bms_first_member(key_attrs); + if (next_key_attnum == -1) + key_done = true; + else + /* Adjust for system attributes */ + next_key_attnum += FirstLowInvalidHeapAttributeNumber; + } } - return true; + *satisfies_hot = hot_result; + *satisfies_key = key_result; } /* @@ -3414,11 +3790,12 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) { HTSU_Result result; HeapUpdateFailureData hufd; + LockTupleMode lockmode; result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */, - &hufd); + &hufd, &lockmode); switch (result) { case HeapTupleSelfUpdated: @@ -3440,6 +3817,28 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) } } + +/* + * Return the MultiXactStatus corresponding to the given tuple lock mode. 
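HeapSatisfiesHOTandKeyUpdate above walks the indexed-column set and the key-column set in a single ascending pass, testing each attribute at most once and stopping early as soon as both answers are known. Here is a standalone sketch of the same idea over two sorted integer arrays instead of Bitmapsets; changed_attr() is a stand-in for heap_tuple_attr_equals, and the system-attribute offset adjustment is left out.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for heap_tuple_attr_equals(): which attributes differ. */
    static bool
    changed_attr(int attnum)
    {
        return attnum == 5;     /* pretend only column 5 was modified */
    }

    /*
     * Walk two sorted attribute-number lists in one ascending pass and stop as
     * soon as both results are known to be false.
     */
    static void
    satisfies_hot_and_key(const int *hot, int nhot, const int *key, int nkey,
                          bool *satisfies_hot, bool *satisfies_key)
    {
        int  ih = 0, ik = 0;
        bool hot_ok = true, key_ok = true;

        while ((ih < nhot || ik < nkey) && (hot_ok || key_ok))
        {
            int  check;
            bool in_hot, in_key;

            if (ik >= nkey || (ih < nhot && hot[ih] <= key[ik]))
                check = hot[ih];
            else
                check = key[ik];
            in_hot = (ih < nhot && hot[ih] == check);
            in_key = (ik < nkey && key[ik] == check);

            if (changed_attr(check))
            {
                if (in_hot)
                    hot_ok = false;
                if (in_key)
                    key_ok = false;
            }
            if (in_hot)
                ih++;
            if (in_key)
                ik++;
        }
        *satisfies_hot = hot_ok;
        *satisfies_key = key_ok;
    }

    int
    main(void)
    {
        int  hot[] = {2, 5, 7};     /* all indexed columns */
        int  key[] = {2};           /* key columns only    */
        bool h, k;

        satisfies_hot_and_key(hot, 3, key, 1, &h, &k);
        printf("HOT-safe: %d, key untouched: %d\n", h, k);  /* prints 0 and 1 */
        return 0;
    }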
+ */ +static MultiXactStatus +get_mxact_status_for_lock(LockTupleMode mode, bool is_update) +{ + MultiXactStatus retval; + + if (is_update) + retval = tupleLockExtraInfo[mode].updstatus; + else + retval = tupleLockExtraInfo[mode].lockstatus; + + if (retval == -1) + elog(ERROR, "invalid lock tuple mode %d/%s", mode, + is_update ? "true" : "false"); + + return retval; +} + + /* * heap_lock_tuple - lock a tuple in shared or exclusive mode * @@ -3452,6 +3851,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * tuple's cmax if lock is successful) * mode: indicates if shared or exclusive tuple lock is desired * nowait: if true, ereport rather than blocking if lock not available + * follow_updates: if true, follow the update chain to also lock descendant + * tuples. * * Output parameters: * *tuple: all fields filled in @@ -3464,61 +3865,30 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * HeapTupleUpdated: lock failed because tuple updated by other xact * * In the failure cases, the routine fills *hufd with the tuple's t_ctid, - * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax + * (the last only for HeapTupleSelfUpdated, since we * cannot obtain cmax from a combocid generated by another transaction). * See comments for struct HeapUpdateFailureData for additional info. * - * - * NOTES: because the shared-memory lock table is of finite size, but users - * could reasonably want to lock large numbers of tuples, we do not rely on - * the standard lock manager to store tuple-level locks over the long term. - * Instead, a tuple is marked as locked by setting the current transaction's - * XID as its XMAX, and setting additional infomask bits to distinguish this - * usage from the more normal case of having deleted the tuple. When - * multiple transactions concurrently share-lock a tuple, the first locker's - * XID is replaced in XMAX with a MultiTransactionId representing the set of - * XIDs currently holding share-locks. - * - * When it is necessary to wait for a tuple-level lock to be released, the - * basic delay is provided by XactLockTableWait or MultiXactIdWait on the - * contents of the tuple's XMAX. However, that mechanism will release all - * waiters concurrently, so there would be a race condition as to which - * waiter gets the tuple, potentially leading to indefinite starvation of - * some waiters. The possibility of share-locking makes the problem much - * worse --- a steady stream of share-lockers can easily block an exclusive - * locker forever. To provide more reliable semantics about who gets a - * tuple-level lock first, we use the standard lock manager. The protocol - * for waiting for a tuple-level lock is really - * LockTuple() - * XactLockTableWait() - * mark tuple as locked by me - * UnlockTuple() - * When there are multiple waiters, arbitration of who is to get the lock next - * is provided by LockTuple(). However, at most one tuple-level lock will - * be held or awaited per backend at any time, so we don't risk overflow - * of the lock table. Note that incoming share-lockers are required to - * do LockTuple as well, if there is any conflict, to ensure that they don't - * starve out waiting exclusive-lockers. However, if there is not any active - * conflict for a tuple, we don't incur any extra overhead. + * See README.tuplock for a thorough explanation of this mechanism. 
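As a quick reference while reading heap_lock_tuple below, here is a standalone sketch of the conflict relationships between the four tuple lock strengths that the code relies on; FOR KEY SHARE conflicts only with exclusive (FOR UPDATE) strength, which is what lets foreign-key checks coexist with updates that leave the key alone. The enum is a stand-in for LockTupleMode, and README.tuplock remains the authoritative description.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for LockTupleMode, ordered from weakest to strongest. */
    typedef enum
    {
        KEY_SHARE,          /* SELECT FOR KEY SHARE / FK checks          */
        SHARE,              /* SELECT FOR SHARE                          */
        NO_KEY_EXCLUSIVE,   /* SELECT FOR NO KEY UPDATE / non-key UPDATE */
        EXCLUSIVE           /* SELECT FOR UPDATE / key UPDATE / DELETE   */
    } tuple_lock_mode;

    /* true where the two modes conflict (cannot be held concurrently) */
    static const bool conflicts[4][4] = {
        /*                     KEY_SHARE SHARE  NO_KEY_EXCL EXCLUSIVE */
        /* KEY_SHARE        */ { false,  false, false,      true  },
        /* SHARE            */ { false,  false, true,       true  },
        /* NO_KEY_EXCLUSIVE */ { false,  true,  true,       true  },
        /* EXCLUSIVE        */ { true,   true,  true,       true  },
    };

    int
    main(void)
    {
        /* the case this patch is about: FK check vs. non-key UPDATE */
        printf("KEY SHARE vs NO KEY EXCLUSIVE conflict? %s\n",
               conflicts[KEY_SHARE][NO_KEY_EXCLUSIVE] ? "yes" : "no");
        return 0;
    }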
*/ HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, bool nowait, + bool follow_updates, Buffer *buffer, HeapUpdateFailureData *hufd) { HTSU_Result result; ItemPointer tid = &(tuple->t_self); ItemId lp; Page page; - TransactionId xid; - TransactionId xmax; - uint16 old_infomask; - uint16 new_infomask; - LOCKMODE tuple_lock_type; + TransactionId xid, + xmax; + uint16 old_infomask, + new_infomask, + new_infomask2; bool have_tuple_lock = false; - tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock; - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); @@ -3542,30 +3912,58 @@ l3: { TransactionId xwait; uint16 infomask; + uint16 infomask2; + bool require_sleep; + ItemPointerData t_ctid; /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetXmax(tuple->t_data); + xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); infomask = tuple->t_data->t_infomask; + infomask2 = tuple->t_data->t_infomask2; + ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); /* - * If we wish to acquire share lock, and the tuple is already - * share-locked by a multixact that includes any subtransaction of the - * current top transaction, then we effectively hold the desired lock - * already. We *must* succeed without trying to take the tuple lock, - * else we will deadlock against anyone waiting to acquire exclusive - * lock. We don't need to make any state changes in this case. + * If any subtransaction of the current top transaction already holds a + * lock as strong or stronger than what we're requesting, we + * effectively hold the desired lock already. We *must* succeed + * without trying to take the tuple lock, else we will deadlock against + * anyone wanting to acquire a stronger lock. */ - if (mode == LockTupleShared && - (infomask & HEAP_XMAX_IS_MULTI) && - MultiXactIdIsCurrent((MultiXactId) xwait)) + if (infomask & HEAP_XMAX_IS_MULTI) { - Assert(infomask & HEAP_XMAX_SHARED_LOCK); - /* Probably can't hold tuple lock here, but may as well check */ - if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); - return HeapTupleMayBeUpdated; + int i; + int nmembers; + MultiXactMember *members; + + /* + * We don't need to allow old multixacts here; if that had been the + * case, HeapTupleSatisfiesUpdate would have returned MayBeUpdated + * and we wouldn't be here. 
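The GetMultiXactIdMembers loop that follows implements the "we already hold the desired lock or stronger" shortcut for the multixact case: if any member of the multi belongs to one of our own (sub)transactions and carries a lock at least as strong as the one requested, heap_lock_tuple can return at once without touching the tuple. A simplified standalone version of that scan is sketched below; the member and mode types are stand-ins for MultiXactMember and LockTupleMode.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins, ordered weakest to strongest so ">=" compares lock strength. */
    typedef enum { KEY_SHARE, SHARE, NO_KEY_EXCLUSIVE, EXCLUSIVE } tuple_lock_mode;

    typedef struct
    {
        unsigned int    xid;
        tuple_lock_mode mode;   /* lock strength this member holds */
    } mxact_member;

    /*
     * Return true if some member of the multixact is one of our own
     * (sub)transactions and already holds a lock >= the requested mode, in
     * which case no further locking work is needed.
     */
    static bool
    already_hold_strong_enough(const mxact_member *members, int nmembers,
                               tuple_lock_mode wanted,
                               bool (*is_my_xid)(unsigned int))
    {
        int i;

        for (i = 0; i < nmembers; i++)
            if (is_my_xid(members[i].xid) && members[i].mode >= wanted)
                return true;
        return false;
    }

    static bool
    my_xid_is_100(unsigned int xid)
    {
        return xid == 100;
    }

    int
    main(void)
    {
        mxact_member members[] = { { 42, KEY_SHARE }, { 100, NO_KEY_EXCLUSIVE } };

        printf("already hold >= SHARE? %s\n",
               already_hold_strong_enough(members, 2, SHARE, my_xid_is_100)
               ? "yes, return immediately" : "no");
        return 0;
    }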
+ */ + nmembers = GetMultiXactIdMembers(xwait, &members, false); + + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) + { + LockTupleMode membermode; + + membermode = TUPLOCK_from_mxstatus(members[i].status); + + if (membermode >= mode) + { + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + + pfree(members); + return HeapTupleMayBeUpdated; + } + } + } + + pfree(members); } /* @@ -3581,255 +3979,435 @@ l3: { if (nowait) { - if (!ConditionalLockTuple(relation, tid, tuple_lock_type)) + if (!ConditionalLockTupleTuplock(relation, tid, mode)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); } else - LockTuple(relation, tid, tuple_lock_type); + LockTupleTuplock(relation, tid, mode); have_tuple_lock = true; } - if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK)) + /* + * Initially assume that we will have to wait for the locking + * transaction(s) to finish. We check various cases below in which + * this can be turned off. + */ + require_sleep = true; + if (mode == LockTupleKeyShare) { /* - * Acquiring sharelock when there's at least one sharelocker - * already. We need not wait for him/them to complete. - */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * Make sure it's still a shared lock, else start over. (It's OK - * if the ownership of the shared lock has changed, though.) + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait. Even if there is an update, we can still + * continue if the key hasn't been modified. + * + * However, if there are updates, we need to walk the update chain + * to mark future versions of the row as locked, too. That way, if + * somebody deletes that future version, we're protected against + * the key going away. This locking of future versions could block + * momentarily, if a concurrent transaction is deleting a key; or + * it could return a value to the effect that the transaction + * deleting the key has already committed. So we do this before + * re-locking the buffer; otherwise this would be prone to + * deadlocks. + * + * Note that the TID we're locking was grabbed before we unlocked + * the buffer. For it to change while we're not looking, the other + * properties we're testing for below after re-locking the buffer + * would also change, in which case we would restart this loop + * above. */ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK)) - goto l3; - } - else if (infomask & HEAP_XMAX_IS_MULTI) - { - /* wait for multixact to end */ - if (nowait) + if (!(infomask2 & HEAP_KEYS_UPDATED)) { - if (!ConditionalMultiXactIdWait((MultiXactId) xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - } - else - MultiXactIdWait((MultiXactId) xwait); + bool updated; - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); - /* - * If xwait had just locked the tuple then some other xact could - * update this tuple before we get to this point. Check for xmax - * change, and start over if so. 
- */ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), - xwait)) - goto l3; + /* + * If there are updates, follow the update chain; bail out + * if that cannot be done. + */ + if (follow_updates && updated) + { + HTSU_Result res; + + res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != HeapTupleMayBeUpdated) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } - /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to lock the - * tuple in either case, however. We don't bother changing the - * on-disk hint bits since we are about to overwrite the xmax - * altogether. - */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. + * Also, if it wasn't updated before we released the lock, but + * is updated now, we start over too; the reason is that we now + * need to follow the update chain to lock the new versions. + */ + if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && + ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || + !updated)) + goto l3; + + /* Things look okay, so we can skip sleeping */ + require_sleep = false; + + /* + * Note we allow Xmax to change here; other updaters/lockers + * could have modified it before we grabbed the buffer lock. + * However, this is not a problem, because with the recheck we + * just did we ensure that they still don't conflict with the + * lock we want. + */ + } } - else + else if (mode == LockTupleShare) { - /* wait for regular transaction to end */ - if (nowait) + /* + * If we're requesting Share, we can similarly avoid sleeping if + * there's no update and no exclusive lock present. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && + !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { - if (!ConditionalXactLockTableWait(xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - } - else - XactLockTableWait(xwait); - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + /* + * Make sure it's still an appropriate lock, else start over. + * See above about allowing xmax to change. + */ + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask)) + goto l3; + require_sleep = false; + } + } + else if (mode == LockTupleNoKeyExclusive) + { /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. + * If we're requesting NoKeyExclusive, we might also be able to + * avoid sleeping; just ensure that there's no other lock type than + * KeyShare. Note that this is a bit more involved than just + * checking hint bits -- we need to expand the multixact to figure + * out lock modes for each one (unless there was only one such + * locker). 
*/ - if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), - xwait)) - goto l3; + if (infomask & HEAP_XMAX_IS_MULTI) + { + int nmembers; + MultiXactMember *members; - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + /* + * We don't need to allow old multixacts here; if that had been + * the case, HeapTupleSatisfiesUpdate would have returned + * MayBeUpdated and we wouldn't be here. + */ + nmembers = GetMultiXactIdMembers(xwait, &members, false); + + if (nmembers <= 0) + { + /* + * No need to keep the previous xmax here. This is unlikely + * to happen. + */ + require_sleep = false; + } + else + { + int i; + bool allowed = true; + + for (i = 0; i < nmembers; i++) + { + if (members[i].status != MultiXactStatusForKeyShare) + { + allowed = false; + break; + } + } + if (allowed) + { + /* + * if the xmax changed under us in the meantime, start + * over. + */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + { + pfree(members); + goto l3; + } + /* otherwise, we're good */ + require_sleep = false; + } + + pfree(members); + } + } + else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* if the xmax changed in the meantime, start over */ + if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + /* otherwise, we're good */ + require_sleep = false; + } } /* - * We may lock if previous xmax aborted, or if it committed but only - * locked the tuple without updating it. The case where we didn't - * wait because we are joining an existing shared lock is correctly - * handled, too. + * By here, we either have already acquired the buffer exclusive lock, + * or we must wait for the locking transaction or multixact; so below + * we ensure that we grab buffer lock after the sleep. */ - if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) - result = HeapTupleMayBeUpdated; - else - result = HeapTupleUpdated; - } - if (result != HeapTupleMayBeUpdated) - { - Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); - Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); - hufd->ctid = tuple->t_data->t_ctid; - hufd->xmax = HeapTupleHeaderGetXmax(tuple->t_data); - if (result == HeapTupleSelfUpdated) - hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); - else - hufd->cmax = 0; /* for lack of an InvalidCommandId value */ - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); - return result; - } + if (require_sleep) + { + if (infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus status = get_mxact_status_for_lock(mode, false); - /* - * We might already hold the desired lock (or stronger), possibly under a - * different subtransaction of the current top transaction. If so, there - * is no need to change state or issue a WAL record. We already handled - * the case where this is true for xmax being a MultiXactId, so now check - * for cases where it is a plain TransactionId. - * - * Note in particular that this covers the case where we already hold - * exclusive lock on the tuple and the caller only wants shared lock. It - * would certainly not do to give up the exclusive lock. 
- */ - xmax = HeapTupleHeaderGetXmax(tuple->t_data); - old_infomask = tuple->t_data->t_infomask; - - if (!(old_infomask & (HEAP_XMAX_INVALID | - HEAP_XMAX_COMMITTED | - HEAP_XMAX_IS_MULTI)) && - (mode == LockTupleShared ? - (old_infomask & HEAP_IS_LOCKED) : - (old_infomask & HEAP_XMAX_EXCL_LOCK)) && - TransactionIdIsCurrentTransactionId(xmax)) - { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - /* Probably can't hold tuple lock here, but may as well check */ - if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); - return HeapTupleMayBeUpdated; - } + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusNoKeyUpdate) + elog(ERROR, "invalid lock mode in heap_lock_tuple"); - /* - * Compute the new xmax and infomask to store into the tuple. Note we do - * not modify the tuple just yet, because that would leave it in the wrong - * state if multixact.c elogs. - */ - xid = GetCurrentTransactionId(); - - new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + /* wait for multixact to end */ + if (nowait) + { + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, + status, NULL, infomask)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + } + else + MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask); - if (mode == LockTupleShared) - { - /* - * If this is the first acquisition of a shared lock in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can - * be certain that the transaction will never become a member of any - * older MultiXactIds than that. (We have to do this even if we end - * up just using our own TransactionId below, since some other backend - * could incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); + /* if there are updates, follow the update chain */ + if (follow_updates && + !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + { + HTSU_Result res; + + res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != HeapTupleMayBeUpdated) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } - new_infomask |= HEAP_XMAX_SHARED_LOCK; + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - /* - * Check to see if we need a MultiXactId because there are multiple - * lockers. - * - * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if - * the xmax was a MultiXactId but it was not running anymore. There is - * a race condition, which is that the MultiXactId may have finished - * since then, but that uncommon case is handled within - * MultiXactIdExpand. - * - * There is a similar race condition possible when the old xmax was a - * regular TransactionId. We test TransactionIdIsInProgress again - * just to narrow the window, but it's still possible to end up - * creating an unnecessary MultiXactId. Fortunately this is harmless. - */ - if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED))) - { - if (old_infomask & HEAP_XMAX_IS_MULTI) - { /* - * If the XMAX is already a MultiXactId, then we need to - * expand it to include our own TransactionId. + * If xwait had just locked the tuple then some other xact + * could update this tuple before we get to this point. Check + * for xmax change, and start over if so. 
*/ - xid = MultiXactIdExpand((MultiXactId) xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; - } - else if (TransactionIdIsInProgress(xmax)) - { + if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + /* - * If the XMAX is a valid TransactionId, then we need to - * create a new MultiXactId that includes both the old locker - * and our own TransactionId. + * Of course, the multixact might not be done here: if we're + * requesting a light lock mode, other transactions with light + * locks could still be alive, as well as locks owned by our + * own xact or other subxacts of this backend. We need to + * preserve the surviving MultiXact members. Note that it + * isn't absolutely necessary in the latter case, but doing so + * is simpler. */ - xid = MultiXactIdCreate(xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; } else { + /* wait for regular transaction to end */ + if (nowait) + { + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + } + else + XactLockTableWait(xwait); + + /* if there are updates, follow the update chain */ + if (follow_updates && + !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) + { + HTSU_Result res; + + res = heap_lock_updated_tuple(relation, tuple, &t_ctid, + GetCurrentTransactionId(), + mode); + if (res != HeapTupleMayBeUpdated) + { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + /* - * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax - * as running, but it finished before - * TransactionIdIsInProgress() got to run. Treat it like - * there's no locker in the tuple. + * xwait is done, but if xwait had just locked the tuple then + * some other xact could update this tuple before we get to + * this point. Check for xmax change, and start over if so. */ + if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + xwait)) + goto l3; + + /* + * Otherwise check if it committed or aborted. Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that should have been handled above. So + * that transaction must necessarily be gone by now. + */ + UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); } } + + /* By here, we're certain that we hold buffer exclusive lock again */ + + /* + * We may lock if previous xmax aborted, or if it committed but only + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. + */ + if (!require_sleep || + (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || + HeapTupleHeaderIsOnlyLocked(tuple->t_data)) + result = HeapTupleMayBeUpdated; else - { - /* - * There was no previous locker, so just insert our own - * TransactionId. 
- */ - } + result = HeapTupleUpdated; } - else + +failed: + if (result != HeapTupleMayBeUpdated) + { + Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); + Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); + hufd->ctid = tuple->t_data->t_ctid; + hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + if (result == HeapTupleSelfUpdated) + hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); + else + hufd->cmax = 0; /* for lack of an InvalidCommandId value */ + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + return result; + } + + xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + old_infomask = tuple->t_data->t_infomask; + + /* + * We might already hold the desired lock (or stronger), possibly under a + * different subtransaction of the current top transaction. If so, there + * is no need to change state or issue a WAL record. We already handled + * the case where this is true for xmax being a MultiXactId, so now check + * for cases where it is a plain TransactionId. + * + * Note in particular that this covers the case where we already hold + * exclusive lock on the tuple and the caller only wants key share or share + * lock. It would certainly not do to give up the exclusive lock. + */ + if (!(old_infomask & (HEAP_XMAX_INVALID | + HEAP_XMAX_COMMITTED | + HEAP_XMAX_IS_MULTI)) && + (mode == LockTupleKeyShare ? + (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask) || + HEAP_XMAX_IS_SHR_LOCKED(old_infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) : + mode == LockTupleShare ? + (HEAP_XMAX_IS_SHR_LOCKED(old_infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) : + (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))) && + TransactionIdIsCurrentTransactionId(xmax)) { - /* We want an exclusive lock on the tuple */ - new_infomask |= HEAP_XMAX_EXCL_LOCK; + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + /* Probably can't hold tuple lock here, but may as well check */ + if (have_tuple_lock) + UnlockTupleTuplock(relation, tid, mode); + return HeapTupleMayBeUpdated; } + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.) + */ + MultiXactIdSetOldestMember(); + + /* + * Compute the new xmax and infomask to store into the tuple. Note we do + * not modify the tuple just yet, because that would leave it in the wrong + * state if multixact.c elogs. + */ + compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, + GetCurrentTransactionId(), mode, false, + &xid, &new_infomask, &new_infomask2); + START_CRIT_SECTION(); /* * Store transaction information of xact locking the tuple. * * Note: Cmax is meaningless in this context, so don't set it; this avoids - * possibly generating a useless combo CID. + * possibly generating a useless combo CID. Moreover, if we're locking a + * previously updated tuple, it's important to preserve the Cmax. + * + * Also reset the HOT UPDATE bit, but only if there's no update; otherwise + * we would break the HOT chain. 
*/ - tuple->t_data->t_infomask = new_infomask; - HeapTupleHeaderClearHotUpdated(tuple->t_data); + tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + tuple->t_data->t_infomask |= new_infomask; + tuple->t_data->t_infomask2 |= new_infomask2; + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + HeapTupleHeaderClearHotUpdated(tuple->t_data); HeapTupleHeaderSetXmax(tuple->t_data, xid); - /* Make sure there is no forward chain link in t_ctid */ - tuple->t_data->t_ctid = *tid; + + /* + * Make sure there is no forward chain link in t_ctid. Note that in the + * cases where the tuple has been updated, we must not overwrite t_ctid, + * because it was set by the updater. Moreover, if the tuple has been + * updated, we need to follow the update chain to lock the new versions + * of the tuple as well. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) + tuple->t_data->t_ctid = *tid; MarkBufferDirty(*buffer); @@ -3854,8 +4432,8 @@ l3: xlrec.target.node = relation->rd_node; xlrec.target.tid = tuple->t_self; xlrec.locking_xid = xid; - xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0); - xlrec.shared_lock = (mode == LockTupleShared); + xlrec.infobits_set = compute_infobits(new_infomask, + tuple->t_data->t_infomask2); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapLock; rdata[0].buffer = InvalidBuffer; @@ -3887,8 +4465,469 @@ l3: * release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); + UnlockTupleTuplock(relation, tid, mode); + + return HeapTupleMayBeUpdated; +} + + +/* + * Given an original set of Xmax and infomask, and a transaction (identified by + * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and + * corresponding infomasks to use on the tuple. + * + * Note that this might have side effects such as creating a new MultiXactId. + * + * Most callers will have called HeapTupleSatisfiesUpdate before this function; + * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId + * but it was not running anymore. There is a race condition, which is that the + * MultiXactId may have finished since then, but that uncommon case is handled + * either here, or within MultiXactIdExpand. + * + * There is a similar race condition possible when the old xmax was a regular + * TransactionId. We test TransactionIdIsInProgress again just to narrow the + * window, but it's still possible to end up creating an unnecessary + * MultiXactId. Fortunately this is harmless. + */ +static void +compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, + uint16 old_infomask2, TransactionId add_to_xmax, + LockTupleMode mode, bool is_update, + TransactionId *result_xmax, uint16 *result_infomask, + uint16 *result_infomask2) +{ + TransactionId new_xmax; + uint16 new_infomask, + new_infomask2; + +l5: + new_infomask = 0; + new_infomask2 = 0; + if (old_infomask & HEAP_XMAX_INVALID) + { + /* + * No previous locker; we just insert our own TransactionId. 
+ */ + if (is_update) + { + new_xmax = add_to_xmax; + if (mode == LockTupleExclusive) + new_infomask2 |= HEAP_KEYS_UPDATED; + } + else + { + new_infomask |= HEAP_XMAX_LOCK_ONLY; + switch (mode) + { + case LockTupleKeyShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_SHR_LOCK; + break; + case LockTupleNoKeyExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + case LockTupleExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + new_infomask2 |= HEAP_KEYS_UPDATED; + break; + default: + new_xmax = InvalidTransactionId; /* silence compiler */ + elog(ERROR, "invalid lock mode"); + } + } + } + else if (old_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactStatus new_status; + + /* + * Currently we don't allow XMAX_COMMITTED to be set for multis, + * so cross-check. + */ + Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); + + /* + * A multixact together with LOCK_ONLY set but neither lock bit set + * (i.e. a pg_upgraded share locked tuple) cannot possibly be running + * anymore. This check is critical for databases upgraded by + * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume + * that such multis are never passed. + */ + if (!(old_infomask & HEAP_LOCK_MASK) && + HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* + * If the XMAX is already a MultiXactId, then we need to expand it to + * include add_to_xmax; but if all the members were lockers and are all + * gone, we can do away with the IS_MULTI bit and just set add_to_xmax + * as the only locker/updater. If all lockers are gone and we have an + * updater that aborted, we can also do without a multi. + * + * The cost of doing GetMultiXactIdMembers would be paid by + * MultiXactIdExpand if we weren't to do this, so this check is not + * incurring extra work anyhow. + */ + if (!MultiXactIdIsRunning(xmax)) + { + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || + TransactionIdDidAbort(MultiXactIdGetUpdateXid(xmax, + old_infomask))) + { + /* + * Reset these bits and restart; otherwise fall through to + * create a new multi below. + */ + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } + + new_status = get_mxact_status_for_lock(mode, is_update); + + new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, + new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (old_infomask & HEAP_XMAX_COMMITTED) + { + /* + * It's a committed update, so we need to preserve him as updater of + * the tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (TransactionIdIsInProgress(xmax)) + { + /* + * If the XMAX is a valid, in-progress TransactionId, then we need to + * create a new MultiXactId that includes both the old locker or + * updater and our own TransactionId. 
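A detail of the HEAP_XMAX_IS_MULTI branch above that is easy to miss: when none of the old multi's members are still running, and the multi either carried no update or its update aborted, compute_new_xmax_infomask simply pretends the xmax is invalid and restarts, rather than folding dead members into a new MultiXactId. A condensed standalone sketch of that decision follows; the booleans stand in for MultiXactIdIsRunning, HEAP_XMAX_IS_LOCKED_ONLY and TransactionIdDidAbort on the multi's updating member.

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum
    {
        TREAT_AS_NO_XMAX,       /* restart as if HEAP_XMAX_INVALID were set */
        EXPAND_EXISTING_MULTI   /* add ourselves to the existing MultiXactId */
    } multi_xmax_action;

    /*
     * Old xmax is a MultiXactId: decide whether it still matters, or whether
     * it can be discarded and the case handled as "no previous locker".
     */
    static multi_xmax_action
    classify_old_multi(bool some_member_running,
                       bool locked_only,
                       bool update_member_aborted)
    {
        if (!some_member_running && (locked_only || update_member_aborted))
            return TREAT_AS_NO_XMAX;
        return EXPAND_EXISTING_MULTI;
    }

    int
    main(void)
    {
        printf("all lockers gone: %s\n",
               classify_old_multi(false, true, false) == TREAT_AS_NO_XMAX
               ? "drop the multi and restart" : "expand the multi");
        return 0;
    }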
+ */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) + { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + status = MultiXactStatusForShare; + else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) + { + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusForUpdate; + else + status = MultiXactStatusForNoKeyUpdate; + } + else + { + /* + * LOCK_ONLY can be present alone only when a page has been + * upgraded by pg_upgrade. But in that case, + * TransactionIdIsInProgress() should have returned false. We + * assume it's no longer locked in this case. + */ + elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); + old_infomask |= HEAP_XMAX_INVALID; + old_infomask &= ~HEAP_XMAX_LOCK_ONLY; + goto l5; + } + } + else + { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + } + + new_status = get_mxact_status_for_lock(mode, is_update); + + /* + * If the existing lock mode is identical to or weaker than the new + * one, we can act as though there is no existing lock, so set + * XMAX_INVALID and restart. + */ + if (xmax == add_to_xmax) + { + LockTupleMode old_mode = TUPLOCK_from_mxstatus(status); + bool old_isupd = ISUPDATE_from_mxstatus(status); + + /* + * We can do this if the new LockTupleMode is higher or equal than + * the old one; and if there was previously an update, we need an + * update, but if there wasn't, then we can accept there not being + * one. + */ + if ((mode >= old_mode) && (is_update || !old_isupd)) + { + /* + * Note that the infomask might contain some other dirty bits. + * However, since the new infomask is reset to zero, we only + * set what's minimally necessary, and that the case that + * checks HEAP_XMAX_INVALID is the very first above, there is + * no need for extra cleanup of the infomask here. + */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && + TransactionIdDidCommit(xmax)) + { + /* + * It's a committed update, so we gotta preserve him as updater of the + * tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) + status = MultiXactStatusUpdate; + else + status = MultiXactStatusNoKeyUpdate; + + new_status = get_mxact_status_for_lock(mode, is_update); + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } + else + { + /* + * Can get here iff the locking/updating transaction was running when + * the infomask was extracted from the tuple, but finished before + * TransactionIdIsInProgress got to run. Deal with it as if there was + * no locker at all in the first place. 
+ */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + *result_infomask = new_infomask; + *result_infomask2 = new_infomask2; + *result_xmax = new_xmax; +} + + +/* + * Recursive part of heap_lock_updated_tuple + * + * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given + * xid with the given mode; if this tuple is updated, recurse to lock the new + * version as well. + */ +static HTSU_Result +heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, + LockTupleMode mode) +{ + ItemPointerData tupid; + HeapTupleData mytup; + Buffer buf; + uint16 new_infomask, + new_infomask2, + old_infomask; + TransactionId xmax, + new_xmax; + + ItemPointerCopy(tid, &tupid); + + for (;;) + { + new_infomask = 0; + new_xmax = InvalidTransactionId; + ItemPointerCopy(&tupid, &(mytup.t_self)); + + if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL)) + elog(ERROR, "unable to fetch updated version of tuple"); + +l4: + CHECK_FOR_INTERRUPTS(); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + old_infomask = mytup.t_data->t_infomask; + xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + + /* + * If this tuple is updated and the key has been modified (or deleted), + * what we do depends on the status of the updating transaction: if + * it's live, we sleep until it finishes; if it has committed, we have + * to fail (i.e. return HeapTupleUpdated); if it aborted, we ignore it. + * For updates that didn't touch the key, we can just plough ahead. + */ + if (!(old_infomask & HEAP_XMAX_INVALID) && + (mytup.t_data->t_infomask2 & HEAP_KEYS_UPDATED)) + { + TransactionId update_xid; + + /* + * Note: we *must* check TransactionIdIsInProgress before + * TransactionIdDidAbort/Commit; see comment at top of tqual.c for + * an explanation. + */ + update_xid = HeapTupleHeaderGetUpdateXid(mytup.t_data); + if (TransactionIdIsCurrentTransactionId(update_xid)) + { + UnlockReleaseBuffer(buf); + return HeapTupleSelfUpdated; + } + else if (TransactionIdIsInProgress(update_xid)) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + /* No LockTupleTuplock here -- see heap_lock_updated_tuple */ + XactLockTableWait(update_xid); + goto l4; + } + else if (TransactionIdDidAbort(update_xid)) + ; /* okay to proceed */ + else if (TransactionIdDidCommit(update_xid)) + { + UnlockReleaseBuffer(buf); + return HeapTupleUpdated; + } + } + + /* compute the new Xmax and infomask values for the tuple ... */ + compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, + xid, mode, false, + &new_xmax, &new_infomask, &new_infomask2); + + START_CRIT_SECTION(); + + /* ... 
and set them */ + HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); + mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; + mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; + mytup.t_data->t_infomask |= new_infomask; + mytup.t_data->t_infomask2 |= new_infomask2; + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_heap_lock_updated xlrec; + XLogRecPtr recptr; + XLogRecData rdata[2]; + Page page = BufferGetPage(buf); + + xlrec.target.node = rel->rd_node; + xlrec.target.tid = mytup.t_self; + xlrec.xmax = new_xmax; + xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); + + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfHeapLockUpdated; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].buffer = buf; + rdata[1].buffer_std = true; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + END_CRIT_SECTION(); + + /* if we find the end of the update chain, we're done. */ + if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || + ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || + HeapTupleHeaderIsOnlyLocked(mytup.t_data)) + { + UnlockReleaseBuffer(buf); + return HeapTupleMayBeUpdated; + } + /* tail recursion */ + ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); + UnlockReleaseBuffer(buf); + } +} + +/* + * heap_lock_updated_tuple + * Follow update chain when locking an updated tuple, acquiring locks (row + * marks) on the updated versions. + * + * The initial tuple is assumed to be already locked. + * + * This function doesn't check visibility; it just unconditionally marks the + * tuple(s) as locked. If any tuple in the updated chain is being deleted + * concurrently (or updated with the key being modified), sleep until the + * transaction doing it is finished. + * + * Note that we don't acquire heavyweight tuple locks on the tuples we walk + * when we have to wait for other transactions to release them, as opposed to + * what heap_lock_tuple does. The reason is that having more than one + * transaction walking the chain is probably uncommon enough that the risk of + * starvation is low: one of the preconditions for being here is that + * the snapshot in use predates the update that created this tuple (because we + * started at an earlier version of the tuple), but at the same time such a + * transaction cannot be using repeatable read or serializable isolation + * levels, because that would lead to a serializability failure. + */ +static HTSU_Result +heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, + TransactionId xid, LockTupleMode mode) +{ + if (!ItemPointerEquals(&tuple->t_self, ctid)) + { + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.)
+ */ + MultiXactIdSetOldestMember(); + + return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); + } + + /* nothing to lock */ return HeapTupleMayBeUpdated; } @@ -4010,6 +5049,9 @@ heap_inplace_update(Relation relation, HeapTuple tuple) * because this function is applied during WAL recovery, when we don't have * access to any such state, and can't depend on the hint bits to be set.) * + * Similarly, cutoff_multi must be less than or equal to the smallest + * MultiXactId used by any transaction currently open. + * * If the tuple is in a shared buffer, caller must hold an exclusive lock on * that buffer. * @@ -4023,7 +5065,8 @@ heap_inplace_update(Relation relation, HeapTuple tuple) * infomask bits. */ bool -heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid) +heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, + MultiXactId cutoff_multi) { bool changed = false; TransactionId xid; @@ -4043,43 +5086,29 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid) changed = true; } - if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + /* + * Note that this code handles IS_MULTI Xmax values, too, but only to mark + * the tuple frozen if the updating Xid in the mxact is below the freeze + * cutoff; it doesn't remove dead members of a very old multixact. + */ + xid = HeapTupleHeaderGetRawXmax(tuple); + if (TransactionIdIsNormal(xid) && + (((!(tuple->t_infomask & HEAP_XMAX_IS_MULTI) && + TransactionIdPrecedes(xid, cutoff_xid))) || + MultiXactIdPrecedes(xid, cutoff_multi))) { - xid = HeapTupleHeaderGetXmax(tuple); - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, cutoff_xid)) - { - HeapTupleHeaderSetXmax(tuple, InvalidTransactionId); + HeapTupleHeaderSetXmax(tuple, InvalidTransactionId); - /* - * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED - * + LOCKED. Normalize to INVALID just to be sure no one gets - * confused. - */ - tuple->t_infomask &= ~HEAP_XMAX_COMMITTED; - tuple->t_infomask |= HEAP_XMAX_INVALID; - HeapTupleHeaderClearHotUpdated(tuple); - changed = true; - } - } - else - { - /*---------- - * XXX perhaps someday we should zero out very old MultiXactIds here? - * - * The only way a stale MultiXactId could pose a problem is if a - * tuple, having once been multiply-share-locked, is not touched by - * any vacuum or attempted lock or deletion for just over 4G MultiXact - * creations, and then in the probably-narrow window where its xmax - * is again a live MultiXactId, someone tries to lock or delete it. - * Even then, another share-lock attempt would work fine. An - * exclusive-lock or delete attempt would face unexpected delay, or - * in the very worst case get a deadlock error. This seems an - * extremely low-probability scenario with minimal downside even if - * it does happen, so for now we don't do the extra bookkeeping that - * would be needed to clean out MultiXactIds. - *---------- + /* + * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + * + LOCKED. Normalize to INVALID just to be sure no one gets + * confused. Also get rid of the HEAP_KEYS_UPDATED bit. */ + tuple->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderClearHotUpdated(tuple); + tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED; + changed = true; } /* @@ -4115,18 +5144,269 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid) return changed; } +/* + * For a given MultiXactId, return the hint bits that should be set in the + * tuple's infomask. 
+ * + * Normally this should be called for a multixact that was just created, and + * so is on our local cache, so the GetMembers call is fast. + */ +static void +GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, + uint16 *new_infomask2) +{ + int nmembers; + MultiXactMember *members; + int i; + uint16 bits = HEAP_XMAX_IS_MULTI; + uint16 bits2 = 0; + bool has_update = false; + + /* + * We only use this in multis we just created, so they cannot be values + * pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false); + + for (i = 0; i < nmembers; i++) + { + switch (members[i].status) + { + case MultiXactStatusForKeyShare: + bits |= HEAP_XMAX_KEYSHR_LOCK; + break; + case MultiXactStatusForShare: + bits |= HEAP_XMAX_SHR_LOCK; + break; + case MultiXactStatusForNoKeyUpdate: + bits |= HEAP_XMAX_EXCL_LOCK; + break; + case MultiXactStatusForUpdate: + bits |= HEAP_XMAX_EXCL_LOCK; + bits2 |= HEAP_KEYS_UPDATED; + break; + case MultiXactStatusNoKeyUpdate: + bits |= HEAP_XMAX_EXCL_LOCK; + has_update = true; + break; + case MultiXactStatusUpdate: + bits |= HEAP_XMAX_EXCL_LOCK; + bits2 |= HEAP_KEYS_UPDATED; + has_update = true; + break; + } + } + if (!has_update) + bits |= HEAP_XMAX_LOCK_ONLY; + + if (nmembers > 0) + pfree(members); + + *new_infomask = bits; + *new_infomask2 = bits2; +} + +/* + * MultiXactIdGetUpdateXid + * + * Given a multixact Xmax and corresponding infomask, which does not have the + * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating + * transaction. + */ +static TransactionId +MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) +{ + TransactionId update_xact = InvalidTransactionId; + MultiXactMember *members; + int nmembers; + + Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY)); + Assert(t_infomask & HEAP_XMAX_IS_MULTI); + + /* + * Since we know the LOCK_ONLY bit is not set, this cannot be a + * multi from pre-pg_upgrade. + */ + nmembers = GetMultiXactIdMembers(xmax, &members, false); + + if (nmembers > 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + /* Ignore lockers */ + if (members[i].status == MultiXactStatusForKeyShare || + members[i].status == MultiXactStatusForShare || + members[i].status == MultiXactStatusForNoKeyUpdate || + members[i].status == MultiXactStatusForUpdate) + continue; + + /* ignore aborted transactions */ + if (TransactionIdDidAbort(members[i].xid)) + continue; + /* there should be at most one non-aborted updater */ + Assert(update_xact == InvalidTransactionId); + Assert(members[i].status == MultiXactStatusNoKeyUpdate || + members[i].status == MultiXactStatusUpdate); + update_xact = members[i].xid; +#ifndef USE_ASSERT_CHECKING + /* + * in an assert-enabled build, walk the whole array to ensure + * there's no other updater. + */ + break; +#endif + } + + pfree(members); + } + + return update_xact; +} + +/* + * HeapTupleGetUpdateXid + * As above, but use a HeapTupleHeader + * + * See also HeapTupleHeaderGetUpdateXid, which can be used without previously + * checking the hint bits. + */ +TransactionId +HeapTupleGetUpdateXid(HeapTupleHeader tuple) +{ + return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple), + tuple->t_infomask); +} + +/* + * Do_MultiXactIdWait + * Actual implementation for the two functions below. + * + * We do this by sleeping on each member using XactLockTableWait. Any + * members that belong to the current backend are *not* waited for, however; + * this would not merely be useless but would lead to Assert failure inside + * XactLockTableWait. 
By the time this returns, it is certain that all + * transactions *of other backends* that were members of the MultiXactId + * that conflict with the requested status are dead (and no new ones can have + * been added, since it is not legal to add members to an existing + * MultiXactId). + * + * But by the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * Note that in case we return false, the number of remaining members is + * not to be trusted. + */ +static bool +Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining, uint16 infomask, bool nowait) +{ + bool allow_old; + bool result = true; + MultiXactMember *members; + int nmembers; + int remain = 0; + + allow_old = !(infomask & HEAP_LOCK_MASK) && HEAP_XMAX_IS_LOCKED_ONLY(infomask); + nmembers = GetMultiXactIdMembers(multi, &members, allow_old); + + if (nmembers >= 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + TransactionId memxid = members[i].xid; + MultiXactStatus memstatus = members[i].status; + + if (TransactionIdIsCurrentTransactionId(memxid)) + { + remain++; + continue; + } + + if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus), + LOCKMODE_from_mxstatus(status))) + { + if (remaining && TransactionIdIsInProgress(memxid)) + remain++; + continue; + } + + /* + * This member conflicts with our multi, so we have to sleep (or + * return failure, if asked to avoid waiting.) + */ + if (nowait) + { + result = ConditionalXactLockTableWait(memxid); + if (!result) + break; + } + else + XactLockTableWait(memxid); + } + + pfree(members); + } + + if (remaining) + *remaining = remain; + + return result; +} + +/* + * MultiXactIdWait + * Sleep on a MultiXactId. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + * + */ +static void +MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining, uint16 infomask) +{ + Do_MultiXactIdWait(multi, status, remaining, infomask, false); +} + +/* + * ConditionalMultiXactIdWait + * As above, but only lock if we can get the lock without blocking. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * If the multixact is now all gone, return true. Returns false if some + * transactions might still be running. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + */ +static bool +ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining, uint16 infomask) +{ + return Do_MultiXactIdWait(multi, status, remaining, infomask, true); +} + /* * heap_tuple_needs_freeze * * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) - * are older than the specified cutoff XID. If so, return TRUE. + * are older than the specified cutoff XID or MultiXactId. If so, return TRUE. * * It doesn't matter whether the tuple is alive or dead, we are checking * to see if a tuple needs to be removed or frozen to avoid wraparound. 
*/ bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, - Buffer buf) + MultiXactId cutoff_multi, Buffer buf) { TransactionId xid; @@ -4135,12 +5415,23 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, TransactionIdPrecedes(xid, cutoff_xid)) return true; - if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + if (!(tuple->t_infomask & HEAP_XMAX_INVALID)) { - xid = HeapTupleHeaderGetXmax(tuple); - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, cutoff_xid)) - return true; + if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + { + xid = HeapTupleHeaderGetRawXmax(tuple); + if (TransactionIdIsNormal(xid) && + TransactionIdPrecedes(xid, cutoff_xid)) + return true; + } + else + { + MultiXactId multi; + + multi = HeapTupleHeaderGetRawXmax(tuple); + if (MultiXactIdPrecedes(multi, cutoff_multi)) + return true; + } } if (tuple->t_infomask & HEAP_MOVED) @@ -4231,7 +5522,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid) { TransactionId xmin = HeapTupleHeaderGetXmin(tuple); - TransactionId xmax = HeapTupleHeaderGetXmax(tuple); + TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); TransactionId xvac = HeapTupleHeaderGetXvac(tuple); if (tuple->t_infomask & HEAP_MOVED) @@ -4387,7 +5678,7 @@ log_heap_clean(Relation reln, Buffer buffer, */ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, - TransactionId cutoff_xid, + TransactionId cutoff_xid, MultiXactId cutoff_multi, OffsetNumber *offsets, int offcnt) { xl_heap_freeze xlrec; @@ -4402,6 +5693,7 @@ log_heap_freeze(Relation reln, Buffer buffer, xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); xlrec.cutoff_xid = cutoff_xid; + xlrec.cutoff_multi = cutoff_multi; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapFreeze; @@ -4463,8 +5755,8 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer, * have modified the buffer(s) and marked them dirty. 
*/ static XLogRecPtr -log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, - Buffer newbuf, HeapTuple newtup, +log_heap_update(Relation reln, Buffer oldbuf, + Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, bool all_visible_cleared, bool new_all_visible_cleared) { xl_heap_update xlrec; @@ -4483,7 +5775,11 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, info = XLOG_HEAP_UPDATE; xlrec.target.node = reln->rd_node; - xlrec.target.tid = from; + xlrec.target.tid = oldtup->t_self; + xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); + xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, + oldtup->t_data->t_infomask2); + xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); xlrec.all_visible_cleared = all_visible_cleared; xlrec.newtid = newtup->t_self; xlrec.new_all_visible_cleared = new_all_visible_cleared; @@ -4748,6 +6044,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) { xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record); TransactionId cutoff_xid = xlrec->cutoff_xid; + MultiXactId cutoff_multi = xlrec->cutoff_multi; Buffer buffer; Page page; @@ -4790,7 +6087,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) ItemId lp = PageGetItemId(page, *offsets); HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp); - (void) heap_freeze_tuple(tuple, cutoff_xid); + (void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi); offsets++; } } @@ -4937,6 +6234,33 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } +/* + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). + */ +static void +fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) +{ + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; +} + static void heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) { @@ -4992,13 +6316,12 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; HeapTupleHeaderClearHotUpdated(htup); - HeapTupleHeaderSetXmax(htup, record->xl_xid); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Mark the page as a candidate for pruning */ @@ -5368,16 +6691,15 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; if 
(hot_update) HeapTupleHeaderSetHotUpdated(htup); else HeapTupleHeaderClearHotUpdated(htup); - HeapTupleHeaderSetXmax(htup, record->xl_xid); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; @@ -5484,6 +6806,7 @@ newsame:; HeapTupleHeaderSetXmin(htup, record->xl_xid); HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; @@ -5564,17 +6887,8 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup = (HeapTupleHeader) PageGetItem(page, lp); - htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | - HEAP_XMAX_INVALID | - HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | - HEAP_MOVED); - if (xlrec->xid_is_mxact) - htup->t_infomask |= HEAP_XMAX_IS_MULTI; - if (xlrec->shared_lock) - htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; - else - htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); @@ -5586,6 +6900,56 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } +static void +heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record) +{ + xl_heap_lock_updated *xlrec = + (xl_heap_lock_updated *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); + return; + } + + buffer = XLogReadBuffer(xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + false); + if (!BufferIsValid(buffer)) + return; + page = (Page) BufferGetPage(buffer); + + if (lsn <= PageGetLSN(page)) /* changes are applied */ + { + UnlockReleaseBuffer(buffer); + return; + } + + offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid)); + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "heap_xlog_lock_updated: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + static void heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) { @@ -5702,6 +7066,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_HEAP2_MULTI_INSERT: heap_xlog_multi_insert(lsn, record); break; + case XLOG_HEAP2_LOCK_UPDATED: + heap_xlog_lock_updated(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 390585bd2e..3ca332d28f 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -463,7 +463,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * that the page is reconsidered for pruning in future. 
*/ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetXmax(htup)); + HeapTupleHeaderGetUpdateXid(htup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: @@ -473,7 +473,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetXmax(htup)); + HeapTupleHeaderGetUpdateXid(htup)); break; case HEAPTUPLE_LIVE: @@ -521,7 +521,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetXmax(htup); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); } /* @@ -746,7 +746,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) /* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetXmax(htup); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); } else { @@ -787,7 +787,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) break; nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetXmax(htup); + priorXmax = HeapTupleHeaderGetUpdateXid(htup); } } } diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 628e3b1277..84472f80cd 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -111,6 +111,7 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/tqual.h" /* @@ -128,6 +129,8 @@ typedef struct RewriteStateData * determine tuple visibility */ TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff * point */ + MultiXactId rs_freeze_multi;/* MultiXactId that will be used as freeze + * cutoff point for multixacts */ MemoryContext rs_cxt; /* for hash tables and entries and tuples in * them */ HTAB *rs_unresolved_tups; /* unmatched A tuples */ @@ -177,6 +180,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup); * new_heap new, locked heap relation to insert tuples to * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen + * freeze_multi multixact before which multis will be frozen * use_wal should the inserts to the new heap be WAL-logged? * * Returns an opaque RewriteState, allocated in current memory context, @@ -184,7 +188,8 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup); */ RewriteState begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, bool use_wal) + TransactionId freeze_xid, MultiXactId freeze_multi, + bool use_wal) { RewriteState state; MemoryContext rw_cxt; @@ -213,6 +218,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin, state->rs_use_wal = use_wal; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; + state->rs_freeze_multi = freeze_multi; state->rs_cxt = rw_cxt; /* Initialize hash tables used to track update chains */ @@ -337,7 +343,8 @@ rewrite_heap_tuple(RewriteState state, * While we have our hands on the tuple, we may as well freeze any * very-old xmin or xmax, so that future VACUUM effort can be saved. */ - heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid); + heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid, + state->rs_freeze_multi); /* * Invalid ctid means that ctid should point to the tuple itself. 
We'll @@ -348,15 +355,15 @@ rewrite_heap_tuple(RewriteState state, /* * If the tuple has been updated, check the old-to-new mapping hash table. */ - if (!(old_tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) && + if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmax(old_tuple->t_data); + hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping) diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 3809e51166..272208417a 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -25,6 +25,21 @@ out_target(StringInfo buf, xl_heaptid *target) ItemPointerGetOffsetNumber(&(target->tid))); } +static void +out_infobits(StringInfo buf, uint8 infobits) +{ + if (infobits & XLHL_XMAX_IS_MULTI) + appendStringInfo(buf, "IS_MULTI "); + if (infobits & XLHL_XMAX_LOCK_ONLY) + appendStringInfo(buf, "LOCK_ONLY "); + if (infobits & XLHL_XMAX_EXCL_LOCK) + appendStringInfo(buf, "EXCL_LOCK "); + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + appendStringInfo(buf, "KEYSHR_LOCK "); + if (infobits & XLHL_KEYS_UPDATED) + appendStringInfo(buf, "KEYS_UPDATED "); +} + void heap_desc(StringInfo buf, uint8 xl_info, char *rec) { @@ -47,6 +62,8 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "delete: "); out_target(buf, &(xlrec->target)); + appendStringInfoChar(buf, ' '); + out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_UPDATE) { @@ -57,9 +74,12 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) else appendStringInfo(buf, "update: "); out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; new %u/%u", + appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); + out_infobits(buf, xlrec->old_infobits_set); + appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid))); + ItemPointerGetOffsetNumber(&(xlrec->newtid)), + xlrec->new_xmax); } else if (info == XLOG_HEAP_HOT_UPDATE) { @@ -70,9 +90,12 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) else appendStringInfo(buf, "hot_update: "); out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; new %u/%u", + appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); + out_infobits(buf, xlrec->old_infobits_set); + appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid))); + ItemPointerGetOffsetNumber(&(xlrec->newtid)), + xlrec->new_xmax); } else if (info == XLOG_HEAP_NEWPAGE) { @@ -87,16 +110,10 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_heap_lock *xlrec = (xl_heap_lock *) rec; - if (xlrec->shared_lock) - appendStringInfo(buf, "shared_lock: "); - else - appendStringInfo(buf, "exclusive_lock: "); - if (xlrec->xid_is_mxact) - appendStringInfo(buf, "mxid "); - else - appendStringInfo(buf, "xid "); - appendStringInfo(buf, "%u ", xlrec->locking_xid); + appendStringInfo(buf, "lock %u: ", xlrec->locking_xid); out_target(buf, &(xlrec->target)); + appendStringInfoChar(buf, ' '); + out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_INPLACE) { @@ -108,7 +125,6 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) else 
appendStringInfo(buf, "UNKNOWN"); } - void heap2_desc(StringInfo buf, uint8 xl_info, char *rec) { @@ -119,10 +135,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_heap_freeze *xlrec = (xl_heap_freeze *) rec; - appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u", + appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u", xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, xlrec->block, - xlrec->cutoff_xid); + xlrec->cutoff_xid, xlrec->cutoff_multi); } else if (info == XLOG_HEAP2_CLEAN) { @@ -160,6 +176,14 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, xlrec->blkno, xlrec->ntuples); } + else if (info == XLOG_HEAP2_LOCK_UPDATED) + { + xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec; + + appendStringInfo(buf, "lock updated: xmax %u msk %04x; ", xlrec->xmax, + xlrec->infobits_set); + out_target(buf, &(xlrec->target)); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index ddd675f610..3e6cba062d 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -16,6 +16,35 @@ #include "access/multixact.h" +static void +out_member(StringInfo buf, MultiXactMember *member) +{ + appendStringInfo(buf, "%u ", member->xid); + switch (member->status) + { + case MultiXactStatusForKeyShare: + appendStringInfoString(buf, "(keysh) "); + break; + case MultiXactStatusForShare: + appendStringInfoString(buf, "(sh) "); + break; + case MultiXactStatusForNoKeyUpdate: + appendStringInfoString(buf, "(fornokeyupd) "); + break; + case MultiXactStatusForUpdate: + appendStringInfoString(buf, "(forupd) "); + break; + case MultiXactStatusNoKeyUpdate: + appendStringInfoString(buf, "(nokeyupd) "); + break; + case MultiXactStatusUpdate: + appendStringInfoString(buf, "(upd) "); + break; + default: + appendStringInfoString(buf, "(unk) "); + break; + } +} void multixact_desc(StringInfo buf, uint8 xl_info, char *rec) @@ -41,10 +70,10 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "create multixact %u offset %u:", - xlrec->mid, xlrec->moff); - for (i = 0; i < xlrec->nxids; i++) - appendStringInfo(buf, " %u", xlrec->xids[i]); + appendStringInfo(buf, "create mxid %u offset %u nmembers %d: ", xlrec->mid, + xlrec->moff, xlrec->nmembers); + for (i = 0; i < xlrec->nmembers; i++) + out_member(buf, &xlrec->members[i]); } else appendStringInfo(buf, "UNKNOWN"); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index ad0abbfe8b..506b208c9c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -41,7 +41,8 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "checkpoint: redo %X/%X; " "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; oldest running xid %u; %s", + "oldest xid %u in DB %u; oldest multi %u in DB %u; " + "oldest running xid %u; %s", (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo, checkpoint->ThisTimeLineID, checkpoint->fullPageWrites ? 
"true" : "false", @@ -51,6 +52,8 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) checkpoint->nextMultiOffset, checkpoint->oldestXid, checkpoint->oldestXidDB, + checkpoint->oldestMulti, + checkpoint->oldestMultiDB, checkpoint->oldestActiveXid, (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online"); } diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 548ddbb4dd..aabcbba49e 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -791,10 +791,10 @@ parent transaction to complete. Not all transactional behaviour is emulated, for example we do not insert a transaction entry into the lock table, nor do we maintain the transaction -stack in memory. Clog entries are made normally. Multitrans is not maintained +stack in memory. Clog entries are made normally. Multixact is not maintained because its purpose is to record tuple level locks that an application has -requested to prevent write locks. Since write locks cannot be obtained at all, -there is never any conflict and so there is no reason to update multitrans. +requested to prevent other tuple locks. Since tuple locks cannot be obtained at +all, there is never any conflict and so there is no reason to update multixact. Subtrans is maintained during recovery but the details of the transaction tree are ignored and all subtransactions reference the top-level TransactionId directly. Since commit is atomic this provides correct lock wait behaviour diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 1ae671743c..9f804f7599 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3,12 +3,18 @@ * multixact.c * PostgreSQL multi-transaction-log manager * - * The pg_multixact manager is a pg_clog-like manager that stores an array - * of TransactionIds for each MultiXactId. It is a fundamental part of the - * shared-row-lock implementation. A share-locked tuple stores a - * MultiXactId in its Xmax, and a transaction that needs to wait for the - * tuple to be unlocked can sleep on the potentially-several TransactionIds - * that compose the MultiXactId. + * The pg_multixact manager is a pg_clog-like manager that stores an array of + * MultiXactMember for each MultiXactId. It is a fundamental part of the + * shared-row-lock implementation. Each MultiXactMember is comprised of a + * TransactionId and a set of flag bits. The name is a bit historical: + * originally, a MultiXactId consisted of more than one TransactionId (except + * in rare corner cases), hence "multi". Nowadays, however, it's perfectly + * legitimate to have MultiXactIds that only include a single Xid. + * + * The meaning of the flag bits is opaque to this module, but they are mostly + * used in heapam.c to identify lock modes that each of the member transactions + * is holding on any given tuple. This module just contains support to store + * and retrieve the arrays. * * We use two SLRU areas, one for storing the offsets at which the data * starts for each MultiXactId in the other one. This trick allows us to @@ -38,6 +44,15 @@ * replay, the next-MXID and next-offset counters are at least as large as * anything we saw during replay. * + * We are able to remove segments no longer necessary by carefully tracking + * each table's used values: during vacuum, any multixact older than a + * certain value is removed; the cutoff value is stored in pg_class. 
+ * The minimum value in each database is stored in pg_database, and the + * global minimum is part of pg_control. Any vacuum that is able to + * advance its database's minimum value also computes a new global minimum, + * and uses this value to truncate older segments. When new multixactid + * values are to be created, care is taken that the counter does not + * fall within the wraparound horizon considering the global minimum value. * * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -54,40 +69,84 @@ #include "access/twophase.h" #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "funcapi.h" #include "miscadmin.h" #include "pg_trace.h" #include "storage/lmgr.h" +#include "storage/pmsignal.h" #include "storage/procarray.h" #include "utils/builtins.h" #include "utils/memutils.h" +#include "utils/snapmgr.h" /* * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is * used everywhere else in Postgres. * - * Note: because both MultiXactOffsets and TransactionIds are 32 bits and - * wrap around at 0xFFFFFFFF, MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need take no - * explicit notice of that fact in this module, except when comparing segment - * and page numbers in TruncateMultiXact - * (see MultiXact{Offset,Member}PagePrecedes). + * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, + * MultiXact page numbering also wraps around at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need + * take no explicit notice of that fact in this module, except when comparing + * segment and page numbers in TruncateMultiXact (see + * MultiXactOffsetPagePrecedes). */ -/* We need four bytes per offset and also four bytes per member */ +/* We need four bytes per offset */ #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) -#define MULTIXACT_MEMBERS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) #define MultiXactIdToOffsetPage(xid) \ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MultiXactIdToOffsetEntry(xid) \ ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) -#define MXOffsetToMemberPage(xid) \ - ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) -#define MXOffsetToMemberEntry(xid) \ - ((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? 
*/ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) + +/* Location (byte offset within page) of flag word for a given member */ +#define MXOffsetToFlagsOffset(xid) \ + ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) +#define MXOffsetToFlagsBitShift(xid) \ + (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ + MXACT_MEMBER_BITS_PER_XACT) + +/* Location (byte offset within page) of TransactionId of given member */ +#define MXOffsetToMemberOffset(xid) \ + (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ + ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) /* @@ -117,6 +176,19 @@ typedef struct MultiXactStateData /* the Offset SLRU area was last truncated at this MultiXactId */ MultiXactId lastTruncationPoint; + /* + * oldest multixact that is still on disk. Anything older than this should + * not be consulted. + */ + MultiXactId oldestMultiXactId; + Oid oldestMultiXactDB; + + /* support for anti-wraparound measures */ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -180,7 +252,8 @@ static MultiXactId *OldestVisibleMXactId; * so they will be uninteresting by the time our next transaction starts. * (XXX not clear that this is correct --- other members of the MultiXact * could hang around longer than we did. However, it's not clear what a - * better policy for flushing old cache entries would be.) + * better policy for flushing old cache entries would be.) FIXME actually + * this is plain wrong now that multixact's may contain update Xids. * * We allocate the cache entries in a memory context that is deleted at * transaction end, so we don't need to do retail freeing of entries. 
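As a quick sanity check of the member-group layout introduced above, the following standalone sketch (not part of the patch; it assumes an 8 kB BLCKSZ and 4-byte TransactionIds, and uses simplified stand-in names rather than the real MULTIXACT_* macros) reproduces the arithmetic behind MXOffsetToMemberPage, MXOffsetToFlagsOffset, MXOffsetToFlagsBitShift and MXOffsetToMemberOffset:

#include <stdio.h>

/* Assumed values for this sketch only: 8 kB pages, 4-byte TransactionIds */
#define BLCKSZ              8192
#define XID_SIZE            4
#define FLAGBYTES_PER_GROUP 4	/* one flag byte per member xact */
#define MEMBERS_PER_GROUP   4
#define GROUP_SIZE          (MEMBERS_PER_GROUP * XID_SIZE + FLAGBYTES_PER_GROUP)
#define GROUPS_PER_PAGE     (BLCKSZ / GROUP_SIZE)
#define MEMBERS_PER_PAGE    (GROUPS_PER_PAGE * MEMBERS_PER_GROUP)

int
main(void)
{
	unsigned	offset = 12345;	/* an arbitrary member offset, for illustration */
	unsigned	page = offset / MEMBERS_PER_PAGE;
	unsigned	group = (offset / MEMBERS_PER_GROUP) % GROUPS_PER_PAGE;
	unsigned	flagsoff = group * GROUP_SIZE;	/* byte offset of the group's flag word */
	unsigned	bshift = (offset % MEMBERS_PER_GROUP) * 8;	/* bit shift within that word */
	unsigned	memberoff = flagsoff + FLAGBYTES_PER_GROUP +
		(offset % MEMBERS_PER_GROUP) * XID_SIZE;	/* byte offset of the member's xid */

	printf("group size %d bytes, %d groups/page, %d bytes wasted per page\n",
		   GROUP_SIZE, GROUPS_PER_PAGE, BLCKSZ - GROUPS_PER_PAGE * GROUP_SIZE);
	printf("member %u: page %u, flag word at %u (bit shift %u), xid at %u\n",
		   offset, page, flagsoff, bshift, memberoff);
	return 0;
}

Compiled and run, this prints a 20-byte group size with 409 groups and 12 wasted bytes per 8 kB page, matching the figures quoted in the comment above.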
@@ -189,53 +262,52 @@ typedef struct mXactCacheEnt { struct mXactCacheEnt *next; MultiXactId multi; - int nxids; - TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ + int nmembers; + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; } mXactCacheEnt; static mXactCacheEnt *MXactCache = NULL; static MemoryContext MXactContext = NULL; - #ifdef MULTIXACT_DEBUG #define debug_elog2(a,b) elog(a,b) #define debug_elog3(a,b,c) elog(a,b,c) #define debug_elog4(a,b,c,d) elog(a,b,c,d) #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f) #else #define debug_elog2(a,b) #define debug_elog3(a,b,c) #define debug_elog4(a,b,c,d) #define debug_elog5(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) #endif /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); -static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids); +static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nxids, TransactionId *xids); -static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset); + int nmembers, MultiXactMember *members); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); /* MultiXact cache management */ -static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids); -static int mXactCacheGetById(MultiXactId multi, TransactionId **xids); -static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids); +static int mxactMemberComparator(const void *arg1, const void *arg2); +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); +static void mXactCachePut(MultiXactId multi, int nmembers, + MultiXactMember *members); -#ifdef MULTIXACT_DEBUG -static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids); -#endif +static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int page1, int page2); static bool MultiXactMemberPagePrecedes(int page1, int page2); -static bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static void TruncateMultiXact(void); static void WriteMZeroPageXlogRec(int pageno, uint8 info); @@ -243,21 +315,22 @@ static void WriteMZeroPageXlogRec(int pageno, uint8 info); * MultiXactIdCreate * Construct a MultiXactId representing two TransactionIds. * - * The two XIDs must be different. + * The two XIDs must be different, or be requesting different statuses. * * NB - we don't worry about our local MultiXactId cache here, because that * is handled by the lower-level routines. 
*/ MultiXactId -MultiXactIdCreate(TransactionId xid1, TransactionId xid2) +MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, + TransactionId xid2, MultiXactStatus status2) { MultiXactId newMulti; - TransactionId xids[2]; + MultiXactMember members[2]; AssertArg(TransactionIdIsValid(xid1)); AssertArg(TransactionIdIsValid(xid2)); - Assert(!TransactionIdEquals(xid1, xid2)); + Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); /* * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs @@ -265,13 +338,15 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2) * caller just did a check on xid1, so it'd be wasted effort. */ - xids[0] = xid1; - xids[1] = xid2; + members[0].xid = xid1; + members[0].status = status1; + members[1].xid = xid2; + members[1].status = status2; - newMulti = CreateMultiXactId(2, xids); + newMulti = CreateMultiXactId(2, members); - debug_elog5(DEBUG2, "Create: returning %u for %u, %u", - newMulti, xid1, xid2); + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(newMulti, 2, members)); return newMulti; } @@ -280,22 +355,27 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2) * MultiXactIdExpand * Add a TransactionId to a pre-existing MultiXactId. * - * If the TransactionId is already a member of the passed MultiXactId, - * just return it as-is. + * If the TransactionId is already a member of the passed MultiXactId with the + * same status, just return it as-is. * * Note that we do NOT actually modify the membership of a pre-existing * MultiXactId; instead we create a new one. This is necessary to avoid - * a race condition against MultiXactIdWait (see notes there). + * a race condition against code trying to wait for one MultiXactId to finish; + * see notes in heapam.c. * * NB - we don't worry about our local MultiXactId cache here, because that * is handled by the lower-level routines. + * + * Note: It is critical that MultiXactIds that come from an old cluster (i.e. + * one upgraded by pg_upgrade from a cluster older than this feature) are not + * passed in. */ MultiXactId -MultiXactIdExpand(MultiXactId multi, TransactionId xid) +MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) { MultiXactId newMulti; - TransactionId *members; - TransactionId *newMembers; + MultiXactMember *members; + MultiXactMember *newMembers; int nmembers; int i; int j; @@ -303,13 +383,20 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) AssertArg(MultiXactIdIsValid(multi)); AssertArg(TransactionIdIsValid(xid)); - debug_elog4(DEBUG2, "Expand: received multi %u, xid %u", - multi, xid); + debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", + multi, xid, mxstatus_to_string(status)); - nmembers = GetMultiXactIdMembers(multi, &members); + /* + * Note: we don't allow for old multis here. The reason is that the + * only caller of this function does a check that the multixact is + * no longer running. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false); if (nmembers < 0) { + MultiXactMember member; + /* * The MultiXactId is obsolete. This can only happen if all the * MultiXactId members stop running between the caller checking and @@ -317,7 +404,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) * caller, but it would complicate the API and it's unlikely to happen * too often, so just deal with it by creating a singleton MultiXact. 
*/ - newMulti = CreateMultiXactId(1, &xid); + member.xid = xid; + member.status = status; + newMulti = CreateMultiXactId(1, &member); debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", multi, newMulti); @@ -325,12 +414,13 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) } /* - * If the TransactionId is already a member of the MultiXactId, just - * return the existing MultiXactId. + * If the TransactionId is already a member of the MultiXactId with the + * same status, just return the existing MultiXactId. */ for (i = 0; i < nmembers; i++) { - if (TransactionIdEquals(members[i], xid)) + if (TransactionIdEquals(members[i].xid, xid) && + (members[i].status == status)) { debug_elog4(DEBUG2, "Expand: %u is already a member of %u", xid, multi); @@ -340,21 +430,31 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) } /* - * Determine which of the members of the MultiXactId are still running, - * and use them to create a new one. (Removing dead members is just an - * optimization, but a useful one. Note we have the same race condition - * here as above: j could be 0 at the end of the loop.) + * Determine which of the members of the MultiXactId are still of interest. + * This is any running transaction, and also any transaction that grabbed + * something stronger than just a lock and was committed. (An update that + * aborted is of no interest here.) + * + * (Removing dead members is just an optimization, but a useful one. + * Note we have the same race condition here as above: j could be 0 at the + * end of the loop.) */ - newMembers = (TransactionId *) - palloc(sizeof(TransactionId) * (nmembers + 1)); + newMembers = (MultiXactMember *) + palloc(sizeof(MultiXactMember) * (nmembers + 1)); for (i = 0, j = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) - newMembers[j++] = members[i]; + if (TransactionIdIsInProgress(members[i].xid) || + ((members[i].status > MultiXactStatusForUpdate) && + TransactionIdDidCommit(members[i].xid))) + { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } } - newMembers[j++] = xid; + newMembers[j].xid = xid; + newMembers[j++].status = status; newMulti = CreateMultiXactId(j, newMembers); pfree(members); @@ -372,17 +472,24 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) * We return true if at least one member of the given MultiXactId is still * running. Note that a "false" result is certain not to change, * because it is not legal to add members to an existing MultiXactId. + * + * Caller is expected to have verified that the multixact does not come from + * a pg_upgraded share-locked tuple. */ bool MultiXactIdIsRunning(MultiXactId multi) { - TransactionId *members; + MultiXactMember *members; int nmembers; int i; debug_elog3(DEBUG2, "IsRunning %u?", multi); - nmembers = GetMultiXactIdMembers(multi, &members); + /* + * "false" here means we assume our callers have checked that the given + * multi cannot possibly come from a pg_upgraded database. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false); if (nmembers < 0) { @@ -391,13 +498,15 @@ MultiXactIdIsRunning(MultiXactId multi) } /* - * Checking for myself is cheap compared to looking in shared memory, so - * first do the equivalent of MultiXactIdIsCurrent(). This is not needed - * for correctness, it's just a fast path. + * Checking for myself is cheap compared to looking in shared memory; + * return true if any live subtransaction of the current top-level + * transaction is a member. 
+ * + * This is not needed for correctness, it's just a fast path. */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsCurrentTransactionId(members[i])) + if (TransactionIdIsCurrentTransactionId(members[i].xid)) { debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); pfree(members); @@ -412,10 +521,10 @@ MultiXactIdIsRunning(MultiXactId multi) */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) + if (TransactionIdIsInProgress(members[i].xid)) { debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", - i, members[i]); + i, members[i].xid); pfree(members); return true; } @@ -428,55 +537,18 @@ MultiXactIdIsRunning(MultiXactId multi) return false; } -/* - * MultiXactIdIsCurrent - * Returns true if the current transaction is a member of the MultiXactId. - * - * We return true if any live subtransaction of the current top-level - * transaction is a member. This is appropriate for the same reason that a - * lock held by any such subtransaction is globally equivalent to a lock - * held by the current subtransaction: no such lock could be released without - * aborting this subtransaction, and hence releasing its locks. So it's not - * necessary to add the current subxact to the MultiXact separately. - */ -bool -MultiXactIdIsCurrent(MultiXactId multi) -{ - bool result = false; - TransactionId *members; - int nmembers; - int i; - - nmembers = GetMultiXactIdMembers(multi, &members); - - if (nmembers < 0) - return false; - - for (i = 0; i < nmembers; i++) - { - if (TransactionIdIsCurrentTransactionId(members[i])) - { - result = true; - break; - } - } - - pfree(members); - - return result; -} - /* * MultiXactIdSetOldestMember * Save the oldest MultiXactId this transaction could be a member of. * - * We set the OldestMemberMXactId for a given transaction the first time - * it's going to acquire a shared lock. We need to do this even if we end - * up using a TransactionId instead of a MultiXactId, because there is a - * chance that another transaction would add our XID to a MultiXactId. + * We set the OldestMemberMXactId for a given transaction the first time it's + * going to do some operation that might require a MultiXactId (tuple lock, + * update or delete). We need to do this even if we end up using a + * TransactionId instead of a MultiXactId, because there is a chance that + * another transaction would add our XID to a MultiXactId. * - * The value to set is the next-to-be-assigned MultiXactId, so this is meant - * to be called just before acquiring a shared lock. + * The value to set is the next-to-be-assigned MultiXactId, so this is meant to + * be called just before doing any such possibly-MultiXactId-able operation. */ void MultiXactIdSetOldestMember(void) @@ -568,81 +640,23 @@ MultiXactIdSetOldestVisible(void) } /* - * MultiXactIdWait - * Sleep on a MultiXactId. - * - * We do this by sleeping on each member using XactLockTableWait. Any - * members that belong to the current backend are *not* waited for, however; - * this would not merely be useless but would lead to Assert failure inside - * XactLockTableWait. By the time this returns, it is certain that all - * transactions *of other backends* that were members of the MultiXactId - * are dead (and no new ones can have been added, since it is not legal - * to add members to an existing MultiXactId). - * - * But by the time we finish sleeping, someone else may have changed the Xmax - * of the containing tuple, so the caller needs to iterate on us somehow. 
+ * ReadNextMultiXactId + * Return the next MultiXactId to be assigned, but don't allocate it */ -void -MultiXactIdWait(MultiXactId multi) -{ - TransactionId *members; - int nmembers; - - nmembers = GetMultiXactIdMembers(multi, &members); - - if (nmembers >= 0) - { - int i; - - for (i = 0; i < nmembers; i++) - { - TransactionId member = members[i]; - - debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)", - i, member); - if (!TransactionIdIsCurrentTransactionId(member)) - XactLockTableWait(member); - } - - pfree(members); - } -} - -/* - * ConditionalMultiXactIdWait - * As above, but only lock if we can get the lock without blocking. - */ -bool -ConditionalMultiXactIdWait(MultiXactId multi) +MultiXactId +ReadNextMultiXactId(void) { - bool result = true; - TransactionId *members; - int nmembers; - - nmembers = GetMultiXactIdMembers(multi, &members); - - if (nmembers >= 0) - { - int i; + MultiXactId mxid; - for (i = 0; i < nmembers; i++) - { - TransactionId member = members[i]; - - debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)", - i, member); - if (!TransactionIdIsCurrentTransactionId(member)) - { - result = ConditionalXactLockTableWait(member); - if (!result) - break; - } - } + /* XXX we could presumably do this without a lock. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + mxid = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); - pfree(members); - } + if (mxid < FirstMultiXactId) + mxid = FirstMultiXactId; - return result; + return mxid; } /* @@ -652,10 +666,10 @@ ConditionalMultiXactIdWait(MultiXactId multi) * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the * given TransactionIds as members. Returns the newly created MultiXactId. * - * NB: the passed xids[] array will be sorted in-place. + * NB: the passed members[] array will be sorted in-place. */ static MultiXactId -CreateMultiXactId(int nxids, TransactionId *xids) +CreateMultiXactId(int nmembers, MultiXactMember *members) { MultiXactId multi; MultiXactOffset offset; @@ -663,10 +677,10 @@ CreateMultiXactId(int nxids, TransactionId *xids) xl_multixact_create xlrec; debug_elog3(DEBUG2, "Create: %s", - mxid_to_string(InvalidMultiXactId, nxids, xids)); + mxid_to_string(InvalidMultiXactId, nmembers, members)); /* - * See if the same set of XIDs already exists in our cache; if so, just + * See if the same set of members already exists in our cache; if so, just * re-use that MultiXactId. (Note: it might seem that looking in our * cache is insufficient, and we ought to search disk to see if a * duplicate definition already exists. But since we only ever create @@ -675,7 +689,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) * corner cases where someone else added us to a MultiXact without our * knowledge, but it's not worth checking for.) */ - multi = mXactCacheGetBySet(nxids, xids); + multi = mXactCacheGetBySet(nmembers, members); if (MultiXactIdIsValid(multi)) { debug_elog2(DEBUG2, "Create: in cache!"); @@ -687,7 +701,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) * in the OFFSETs and MEMBERs files. NB: this routine does * START_CRIT_SECTION(). */ - multi = GetNewMultiXactId(nxids, &offset); + multi = GetNewMultiXactId(nmembers, &offset); /* * Make an XLOG entry describing the new MXID. @@ -704,27 +718,34 @@ CreateMultiXactId(int nxids, TransactionId *xids) */ xlrec.mid = multi; xlrec.moff = offset; - xlrec.nxids = nxids; + xlrec.nmembers = nmembers; + /* + * XXX Note: there's a lot of padding space in MultiXactMember. 
We could + * find a more compact representation of this Xlog record -- perhaps all the + * status flags in one XLogRecData, then all the xids in another one? Not + * clear that it's worth the trouble though. + */ rdata[0].data = (char *) (&xlrec); - rdata[0].len = MinSizeOfMultiXactCreate; + rdata[0].len = SizeOfMultiXactCreate; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) xids; - rdata[1].len = nxids * sizeof(TransactionId); + + rdata[1].data = (char *) members; + rdata[1].len = nmembers * sizeof(MultiXactMember); rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata); /* Now enter the information into the OFFSETs and MEMBERs logs */ - RecordNewMultiXact(multi, offset, nxids, xids); + RecordNewMultiXact(multi, offset, nmembers, members); /* Done with critical section */ END_CRIT_SECTION(); /* Store the new MultiXactId in the local cache, too */ - mXactCachePut(multi, nxids, xids); + mXactCachePut(multi, nmembers, members); debug_elog2(DEBUG2, "Create: all done"); @@ -739,7 +760,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) */ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nxids, TransactionId *xids) + int nmembers, MultiXactMember *members) { int pageno; int prev_pageno; @@ -775,12 +796,21 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, prev_pageno = -1; - for (i = 0; i < nxids; i++, offset++) + for (i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); pageno = MXOffsetToMemberPage(offset); - entryno = MXOffsetToMemberEntry(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); if (pageno != prev_pageno) { @@ -789,10 +819,17 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, } memberptr = (TransactionId *) - MultiXactMemberCtl->shared->page_buffer[slotno]; - memberptr += entryno; + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - *memberptr = xids[i]; + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; MultiXactMemberCtl->shared->page_dirty[slotno] = true; } @@ -816,27 +853,115 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, * caller must end the critical section after writing SLRU data. 
*/ static MultiXactId -GetNewMultiXactId(int nxids, MultiXactOffset *offset) +GetNewMultiXactId(int nmembers, MultiXactOffset *offset) { MultiXactId result; MultiXactOffset nextOffset; - debug_elog3(DEBUG2, "GetNew: for %d xids", nxids); + debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); /* MultiXactIdSetOldestMember() must have been called already */ Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + /* safety check, we should never get this far in a HS slave */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign MultiXactIds during recovery"); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); /* Handle wraparound of the nextMXact counter */ if (MultiXactState->nextMXact < FirstMultiXactId) MultiXactState->nextMXact = FirstMultiXactId; - /* - * Assign the MXID, and make sure there is room for it in the file. - */ + /* Assign the MXID */ result = MultiXactState->nextMXact; + /*---------- + * Check to see if it's safe to assign another MultiXactId. This protects + * against catastrophic data loss due to multixact wraparound. The basic + * rules are: + * + * If we're past multiVacLimit, start trying to force autovacuum cycles. + * If we're past multiWarnLimit, start issuing warnings. + * If we're past multiStopLimit, refuse to create new MultiXactIds. + * + * Note these are pretty much the same protections in GetNewTransactionId. + *---------- + */ + if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) + { + /* + * For safety's sake, we release MultiXactGenLock while sending + * signals, warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. + */ + MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; + MultiXactId multiStopLimit = MultiXactState->multiStopLimit; + MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; + Oid oldest_datoid = MultiXactState->oldestMultiXactDB; + + LWLockRelease(MultiXactGenLock); + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K transaction starts. This still gives + * plenty of chances before we get into real trouble. 
+ */ + if (IsUnderPostmaster && (result % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (IsUnderPostmaster && + !MultiXactIdPrecedes(result, multiStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + } + else if (!MultiXactIdPrecedes(result, multiWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used", + oldest_datname, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used", + oldest_datoid, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + result = MultiXactState->nextMXact; + if (result < FirstMultiXactId) + result = FirstMultiXactId; + } + + /* Make sure there is room for the MXID in the file. */ ExtendMultiXactOffset(result); /* @@ -848,12 +973,12 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) if (nextOffset == 0) { *offset = 1; - nxids++; /* allocate member slot 0 too */ + nmembers++; /* allocate member slot 0 too */ } else *offset = nextOffset; - ExtendMultiXactMember(nextOffset, nxids); + ExtendMultiXactMember(nextOffset, nmembers); /* * Critical section from here until caller has written the data into the @@ -870,13 +995,14 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) * * We don't care about MultiXactId wraparound here; it will be handled by * the next iteration. But note that nextMXact may be InvalidMultiXactId - * after this routine exits, so anyone else looking at the variable must - * be prepared to deal with that. Similarly, nextOffset may be zero, but - * we won't use that as the actual start offset of the next multixact. + * or the first value on a segment-beginning page after this routine exits, + * so anyone else looking at the variable must be prepared to deal with + * either case. Similarly, nextOffset may be zero, but we won't use that + * as the actual start offset of the next multixact. 
*/ (MultiXactState->nextMXact)++; - MultiXactState->nextOffset += nxids; + MultiXactState->nextOffset += nmembers; LWLockRelease(MultiXactGenLock); @@ -886,14 +1012,23 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) /* * GetMultiXactIdMembers - * Returns the set of TransactionIds that make up a MultiXactId + * Returns the set of MultiXactMembers that make up a MultiXactId + * + * If the given MultiXactId is older than the value we know to be oldest, we + * return -1. The caller is expected to allow that only in permissible cases, + * i.e. when the infomask lets it presuppose that the tuple had been + * share-locked before a pg_upgrade; this means that the HEAP_XMAX_LOCK_ONLY + * needs to be set, but HEAP_XMAX_KEYSHR_LOCK and HEAP_XMAX_EXCL_LOCK are not + * set. * - * We return -1 if the MultiXactId is too old to possibly have any members - * still running; in that case we have not actually looked them up, and - * *xids is not set. + * Other border conditions, such as trying to read a value that's larger than + * the value currently known as the next to assign, raise an error. Previously + * these also returned -1, but since this can lead to the wrong visibility + * results, it is dangerous to do that. */ int -GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) +GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, + bool allow_old) { int pageno; int prev_pageno; @@ -904,21 +1039,22 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) int length; int truelength; int i; + MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactId tmpMXact; MultiXactOffset nextOffset; - TransactionId *ptr; + MultiXactMember *ptr; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); Assert(MultiXactIdIsValid(multi)); /* See if the MultiXactId is in the local cache */ - length = mXactCacheGetById(multi, xids); + length = mXactCacheGetById(multi, members); if (length >= 0) { debug_elog3(DEBUG2, "GetMembers: found %s in the cache", - mxid_to_string(multi, length, *xids)); + mxid_to_string(multi, length, *members)); return length; } @@ -928,43 +1064,48 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) /* * We check known limits on MultiXact before resorting to the SLRU area. * - * An ID older than our OldestVisibleMXactId[] entry can't possibly still - * be running, and we'd run the risk of trying to read already-truncated - * SLRU data if we did try to examine it. + * An ID older than MultiXactState->oldestMultiXactId cannot possibly be + * useful; it should have already been frozen by vacuum. We've truncated + * the on-disk structures anyway. Returning the wrong values could lead to + * an incorrect visibility result. However, to support pg_upgrade we need + * to allow an empty set to be returned regardless, if the caller is + * willing to accept it; the caller is expected to check that it's an + * allowed condition (such as ensuring that the infomask bits set on the + * tuple are consistent with the pg_upgrade scenario). If the caller is + * expecting this to be called only on recently created multis, then we + * raise an error. * * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is - * seen, it implies undetected ID wraparound has occurred. We just - * silently assume that such an ID is no longer running. + * seen, it implies undetected ID wraparound has occurred. This raises + * a hard error. * * Shared lock is enough here since we aren't modifying any global state. 
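Per the new header comment on GetMultiXactIdMembers() above, allow_old is only meant to be passed as true when the tuple's infomask proves the xmax is a lock-only multixact left behind by an older cluster run through pg_upgrade: HEAP_XMAX_LOCK_ONLY set, and neither HEAP_XMAX_KEYSHR_LOCK nor HEAP_XMAX_EXCL_LOCK set. A hedged sketch of that caller-side test, using stand-in bit values rather than the real htup_details.h definitions:

#include <stdbool.h>
#include <stdint.h>

/* illustrative stand-ins for the infomask bits named in the comment above */
#define EX_XMAX_LOCK_ONLY	0x0080
#define EX_XMAX_KEYSHR_LOCK	0x0010
#define EX_XMAX_EXCL_LOCK	0x0040

/* true if a missing (already-truncated) multixact is acceptable for this tuple */
static bool
example_allow_old_multi(uint16_t infomask)
{
	return (infomask & EX_XMAX_LOCK_ONLY) != 0 &&
		(infomask & (EX_XMAX_KEYSHR_LOCK | EX_XMAX_EXCL_LOCK)) == 0;
}

int
main(void)
{
	/* an old share-locked tuple from before the upgrade: lookup may return -1 */
	return example_allow_old_multi(EX_XMAX_LOCK_ONLY) ? 0 : 1;
}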
- * Also, we can examine our own OldestVisibleMXactId without the lock, - * since no one else is allowed to change it. - */ - if (MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) - { - debug_elog2(DEBUG2, "GetMembers: it's too old"); - *xids = NULL; - return -1; - } - - /* - * Acquire the shared lock just long enough to grab the current counter - * values. We may need both nextMXact and nextOffset; see below. + * Acquire it just long enough to grab the current counter values. We may + * need both nextMXact and nextOffset; see below. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMXact = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; LWLockRelease(MultiXactGenLock); - if (!MultiXactIdPrecedes(multi, nextMXact)) + if (MultiXactIdPrecedes(multi, oldestMXact)) { - debug_elog2(DEBUG2, "GetMembers: it's too new!"); - *xids = NULL; + ereport(allow_old ? DEBUG1 : ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u does no longer exist -- apparent wraparound", + multi))); return -1; } + if (!MultiXactIdPrecedes(multi, nextMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u has not been created yet -- apparent wraparound", + multi))); + /* * Find out the offset at which we need to start reading MultiXactMembers * and the number of members in the multixact. We determine the latter as @@ -1055,8 +1196,8 @@ retry: LWLockRelease(MultiXactOffsetControlLock); - ptr = (TransactionId *) palloc(length * sizeof(TransactionId)); - *xids = ptr; + ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + *members = ptr; /* Now get the members themselves. */ LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); @@ -1066,9 +1207,13 @@ retry: for (i = 0; i < length; i++, offset++) { TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; pageno = MXOffsetToMemberPage(offset); - entryno = MXOffsetToMemberEntry(offset); + memberoff = MXOffsetToMemberOffset(offset); if (pageno != prev_pageno) { @@ -1077,8 +1222,7 @@ retry: } xactptr = (TransactionId *) - MultiXactMemberCtl->shared->page_buffer[slotno]; - xactptr += entryno; + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); if (!TransactionIdIsValid(*xactptr)) { @@ -1087,7 +1231,13 @@ retry: continue; } - ptr[truelength++] = *xactptr; + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + ptr[truelength].xid = *xactptr; + ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + truelength++; } LWLockRelease(MultiXactMemberControlLock); @@ -1102,6 +1252,30 @@ retry: return truelength; } +/* + * mxactMemberComparator + * qsort comparison function for MultiXactMember + * + * We can't use wraparound comparison for XIDs because that does not respect + * the triangle inequality! Any old sort order will do. 
+ */ +static int +mxactMemberComparator(const void *arg1, const void *arg2) +{ + MultiXactMember member1 = *(const MultiXactMember *) arg1; + MultiXactMember member2 = *(const MultiXactMember *) arg2; + + if (member1.xid > member2.xid) + return 1; + if (member1.xid < member2.xid) + return -1; + if (member1.status > member2.status) + return 1; + if (member1.status < member2.status) + return -1; + return 0; +} + /* * mXactCacheGetBySet * returns a MultiXactId from the cache based on the set of @@ -1113,26 +1287,29 @@ retry: * for the majority of tuples, thus keeping MultiXactId usage low (saving * both I/O and wraparound issues). * - * NB: the passed xids[] array will be sorted in-place. + * NB: the passed members array will be sorted in-place. */ static MultiXactId -mXactCacheGetBySet(int nxids, TransactionId *xids) +mXactCacheGetBySet(int nmembers, MultiXactMember *members) { mXactCacheEnt *entry; debug_elog3(DEBUG2, "CacheGet: looking for %s", - mxid_to_string(InvalidMultiXactId, nxids, xids)); + mxid_to_string(InvalidMultiXactId, nmembers, members)); /* sort the array so comparison is easy */ - qsort(xids, nxids, sizeof(TransactionId), xidComparator); + qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); for (entry = MXactCache; entry != NULL; entry = entry->next) { - if (entry->nxids != nxids) + if (entry->nmembers != nmembers) continue; - /* We assume the cache entries are sorted */ - if (memcmp(xids, entry->xids, nxids * sizeof(TransactionId)) == 0) + /* + * We assume the cache entries are sorted, and that the unused bits in + * "status" are zeroed. + */ + if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) { debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); return entry->multi; @@ -1145,14 +1322,14 @@ mXactCacheGetBySet(int nxids, TransactionId *xids) /* * mXactCacheGetById - * returns the composing TransactionId set from the cache for a + * returns the composing MultiXactMember set from the cache for a * given MultiXactId, if present. * * If successful, *xids is set to the address of a palloc'd copy of the - * TransactionId set. Return value is number of members, or -1 on failure. + * MultiXactMember set. Return value is number of members, or -1 on failure. */ static int -mXactCacheGetById(MultiXactId multi, TransactionId **xids) +mXactCacheGetById(MultiXactId multi, MultiXactMember **members) { mXactCacheEnt *entry; @@ -1162,18 +1339,18 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids) { if (entry->multi == multi) { - TransactionId *ptr; + MultiXactMember *ptr; Size size; - size = sizeof(TransactionId) * entry->nxids; - ptr = (TransactionId *) palloc(size); - *xids = ptr; + size = sizeof(MultiXactMember) * entry->nmembers; + ptr = (MultiXactMember *) palloc(size); + *members = ptr; - memcpy(ptr, entry->xids, size); + memcpy(ptr, entry->members, size); debug_elog3(DEBUG2, "CacheGet: found %s", - mxid_to_string(multi, entry->nxids, entry->xids)); - return entry->nxids; + mxid_to_string(multi, entry->nmembers, entry->members)); + return entry->nmembers; } } @@ -1186,12 +1363,12 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids) * Add a new MultiXactId and its composing set into the local cache. 
*/ static void -mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) +mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) { mXactCacheEnt *entry; debug_elog3(DEBUG2, "CachePut: storing %s", - mxid_to_string(multi, nxids, xids)); + mxid_to_string(multi, nmembers, members)); if (MXactContext == NULL) { @@ -1206,36 +1383,67 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) entry = (mXactCacheEnt *) MemoryContextAlloc(MXactContext, - offsetof(mXactCacheEnt, xids) + - nxids * sizeof(TransactionId)); + offsetof(mXactCacheEnt, members) + + nmembers * sizeof(MultiXactMember)); entry->multi = multi; - entry->nxids = nxids; - memcpy(entry->xids, xids, nxids * sizeof(TransactionId)); + entry->nmembers = nmembers; + memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ - qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator); + qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); entry->next = MXactCache; MXactCache = entry; } -#ifdef MULTIXACT_DEBUG static char * -mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids) +mxstatus_to_string(MultiXactStatus status) +{ + switch (status) + { + case MultiXactStatusForKeyShare: + return "keysh"; + case MultiXactStatusForShare: + return "sh"; + case MultiXactStatusForNoKeyUpdate: + return "fornokeyupd"; + case MultiXactStatusForUpdate: + return "forupd"; + case MultiXactStatusNoKeyUpdate: + return "nokeyupd"; + case MultiXactStatusUpdate: + return "upd"; + default: + elog(ERROR, "unrecognized multixact status %d", status); + return ""; + } +} + +char * +mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) { - char *str = palloc(15 * (nxids + 1) + 4); + static char *str = NULL; + StringInfoData buf; int i; - snprintf(str, 47, "%u %d[%u", multi, nxids, xids[0]); + if (str != NULL) + pfree(str); - for (i = 1; i < nxids; i++) - snprintf(str + strlen(str), 17, ", %u", xids[i]); + initStringInfo(&buf); - strcat(str, "]"); + appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid, + mxstatus_to_string(members[0].status)); + + for (i = 1; i < nmembers; i++) + appendStringInfo(&buf, ", %u (%s)", members[i].xid, + mxstatus_to_string(members[i].status)); + + appendStringInfoChar(&buf, ']'); + str = MemoryContextStrdup(TopMemoryContext, buf.data); + pfree(buf.data); return str; } -#endif /* * AtEOXact_MultiXact @@ -1512,8 +1720,9 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog) * This must be called ONCE during postmaster or standalone-backend startup. * * StartupXLOG has already established nextMXact/nextOffset by calling - * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we - * may already have replayed WAL data into the SLRU files. + * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti + * info from pg_control and/or MultiXactAdvanceOldest. Note that we may + * already have replayed WAL data into the SLRU files. * * We don't need any locks here, really; the SLRU locks are taken * only because slru.c expects to be called with locks held. @@ -1525,6 +1734,7 @@ StartupMultiXact(void) MultiXactOffset offset = MultiXactState->nextOffset; int pageno; int entryno; + int flagsoff; /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); @@ -1569,28 +1779,30 @@ StartupMultiXact(void) * Zero out the remainder of the current members page. See notes in * TrimCLOG() for motivation. 
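For example, with the reworked mxid_to_string() above a multixact now prints both the xid and the status of each member: a value 17 whose members are xid 100 holding a key-share lock and xid 105 performing a no-key update would be rendered as "17 2[100 (keysh), 105 (nokeyupd)]" (the specific numbers here are invented for illustration).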
*/ - entryno = MXOffsetToMemberEntry(offset); - if (entryno != 0) + flagsoff = MXOffsetToFlagsOffset(offset); + if (flagsoff != 0) { int slotno; TransactionId *xidptr; + int memberoff; + memberoff = MXOffsetToMemberOffset(offset); slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); - xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno]; - xidptr += entryno; + xidptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId))); + MemSet(xidptr, 0, BLCKSZ - memberoff); + + /* + * Note: we don't need to zero out the flag bits in the remaining + * members of the current group, because they are always reset before + * writing. + */ MultiXactMemberCtl->shared->page_dirty[slotno] = true; } LWLockRelease(MultiXactMemberControlLock); - - /* - * Initialize lastTruncationPoint to invalid, ensuring that the first - * checkpoint will try to do truncation. - */ - MultiXactState->lastTruncationPoint = InvalidMultiXactId; } /* @@ -1607,22 +1819,25 @@ ShutdownMultiXact(void) } /* - * Get the next MultiXactId and offset to save in a checkpoint record + * Get the MultiXact data to save in a checkpoint record */ void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, - MultiXactOffset *nextMultiOffset) + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB) { LWLockAcquire(MultiXactGenLock, LW_SHARED); - *nextMulti = MultiXactState->nextMXact; *nextMultiOffset = MultiXactState->nextOffset; - + *oldestMulti = MultiXactState->oldestMultiXactId; + *oldestMultiDB = MultiXactState->oldestMultiXactDB; LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u", - *nextMulti, *nextMultiOffset); + debug_elog6(DEBUG2, + "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } /* @@ -1637,17 +1852,6 @@ CheckPointMultiXact(void) SimpleLruFlush(MultiXactOffsetCtl, true); SimpleLruFlush(MultiXactMemberCtl, true); - /* - * Truncate the SLRU files. This could be done at any time, but - * checkpoint seems a reasonable place for it. There is one exception: if - * we are called during xlog recovery, then shared->latest_page_number - * isn't valid (because StartupMultiXact hasn't been called yet) and so - * SimpleLruTruncate would get confused. It seems best not to risk - * removing any data during recovery anyway, so don't truncate. - */ - if (!RecoveryInProgress()) - TruncateMultiXact(); - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); } @@ -1671,9 +1875,129 @@ MultiXactSetNextMXact(MultiXactId nextMulti, LWLockRelease(MultiXactGenLock); } +/* + * Determine the last safe MultiXactId to allocate given the currently oldest + * datminmxid (ie, the oldest MultiXactId that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + */ +void +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) +{ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + MultiXactId curMulti; + + Assert(MultiXactIdIsValid(oldest_datminmxid)); + + /* + * The place where we actually get into deep trouble is halfway around + * from the oldest potentially-existing XID/multi. (This calculation is + * probably off by one or two counts for Xids, because the special XIDs + * reduce the size of the loop a little bit. 
But we throw in plenty of + * slop below, so it doesn't matter.) + */ + multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); + if (multiWrapLimit < FirstMultiXactId) + multiWrapLimit += FirstMultiXactId; + + /* + * We'll refuse to continue assigning MultiXactIds once we get within 100 + * multi of data loss. + */ + multiStopLimit = multiWrapLimit - 100; + if (multiStopLimit < FirstMultiXactId) + multiStopLimit -= FirstMultiXactId; + + /* + * We'll start complaining loudly when we get within 10M multis of the stop + * point. This is kind of arbitrary, but if you let your gas gauge get + * down to 1% of full, would you be looking for the next gas station? We + * need to be fairly liberal about this number because there are lots of + * scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) + */ + multiWarnLimit = multiStopLimit - 10000000; + if (multiWarnLimit < FirstMultiXactId) + multiWarnLimit -= FirstMultiXactId; + + /* + * We'll start trying to force autovacuums when oldest_datminmxid gets + * to be more than 200 million transactions old. + */ + multiVacLimit = oldest_datminmxid + 200000000; + if (multiVacLimit < FirstMultiXactId) + multiVacLimit += FirstMultiXactId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = oldest_datminmxid; + MultiXactState->oldestMultiXactDB = oldest_datoid; + MultiXactState->multiVacLimit = multiVacLimit; + MultiXactState->multiWarnLimit = multiWarnLimit; + MultiXactState->multiStopLimit = multiStopLimit; + MultiXactState->multiWrapLimit = multiWrapLimit; + curMulti = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u", + multiWrapLimit, oldest_datoid))); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if (MultiXactIdPrecedes(multiVacLimit, curMulti) && + IsUnderPostmaster && !InRecovery) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && !InRecovery) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
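To make the threshold arithmetic in SetMultiXactIdLimit() above concrete, here is a minimal standalone sketch, not part of the patch, that computes the four limits for an assumed oldest_datminmxid using the same modulo-2^32 unsigned arithmetic; FirstMultiXactId and MaxMultiXactId are redeclared locally purely for the example.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t MultiXactId;

#define FirstMultiXactId	((MultiXactId) 1)			/* assumed, for the sketch */
#define MaxMultiXactId		((MultiXactId) 0xFFFFFFFF)	/* assumed, for the sketch */

int
main(void)
{
	MultiXactId oldest = 42;		/* assumed oldest datminmxid */
	MultiXactId wrap, stop, warn, vac;

	wrap = oldest + (MaxMultiXactId >> 1);	/* halfway around the counter */
	if (wrap < FirstMultiXactId)
		wrap += FirstMultiXactId;

	stop = wrap - 100;						/* refuse to assign new multis here */
	if (stop < FirstMultiXactId)
		stop -= FirstMultiXactId;

	warn = stop - 10000000;					/* start warning 10M multis earlier */
	if (warn < FirstMultiXactId)
		warn -= FirstMultiXactId;

	vac = oldest + 200000000;				/* force autovacuum 200M multis out */
	if (vac < FirstMultiXactId)
		vac += FirstMultiXactId;

	printf("vac=%u warn=%u stop=%u wrap=%u\n",
		   (unsigned) vac, (unsigned) warn, (unsigned) stop, (unsigned) wrap);
	return 0;
}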
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used", + oldest_datname, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used", + oldest_datoid, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions."))); + } +} + /* * Ensure the next-to-be-assigned MultiXactId is at least minMulti, - * and similarly nextOffset is at least minMultiOffset + * and similarly nextOffset is at least minMultiOffset. * * This is used when we can determine minimum safe values from an XLog * record (either an on-line checkpoint or an mxact creation log entry). @@ -1699,6 +2023,17 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, LWLockRelease(MultiXactGenLock); } +/* + * Update our oldestMultiXactId value, but only if it's more recent than + * what we had. + */ +void +MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) +{ + if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) + SetMultiXactIdLimit(oldestMulti, oldestMultiDB); +} + /* * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. * @@ -1748,13 +2083,16 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) */ while (nmembers > 0) { - int entryno; + int flagsoff; + int flagsbit; + int difference; /* * Only zero when at first entry of a page. */ - entryno = MXOffsetToMemberEntry(offset); - if (entryno == 0) + flagsoff = MXOffsetToFlagsOffset(offset); + flagsbit = MXOffsetToFlagsBitShift(offset); + if (flagsoff == 0 && flagsbit == 0) { int pageno; @@ -1769,33 +2107,32 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) } /* Advance to next page (OK if nmembers goes negative) */ - offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno); - nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno); + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + offset += difference; + nmembers -= difference; } } /* - * Remove all MultiXactOffset and MultiXactMember segments before the oldest - * ones still of interest. + * GetOldestMultiXactId * - * This is called only during checkpoints. We assume no more than one - * backend does this at a time. + * Return the oldest MultiXactId that's still possibly seen as live by + * any running transaction. Older ones might still exist on disk, but they no + * longer have any running member transaction. * - * XXX do we have any issues with needing to checkpoint here? + * It's not safe to truncate MultiXact SLRU segments based on the value returned by + * this function; however, it can be used by a full-table vacuum to set the + * point at which it will be possible to truncate SLRU for that table. */ -static void -TruncateMultiXact(void) +MultiXactId +GetOldestMultiXactId(void) { - MultiXactId nextMXact; - MultiXactOffset nextOffset; - MultiXactId oldestMXact; - MultiXactOffset oldestOffset; - int cutoffPage; - int i; + MultiXactId oldestMXact; + MultiXactId nextMXact; + int i; /* - * First, compute where we can safely truncate.
Per notes above, this is - * the oldest valid value among all the OldestMemberMXactId[] and + * This is the oldest valid value among all the OldestMemberMXactId[] and * OldestVisibleMXactId[] entries, or nextMXact if none are valid. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); @@ -1824,28 +2161,69 @@ TruncateMultiXact(void) oldestMXact = thisoldest; } - /* Save the current nextOffset too */ - nextOffset = MultiXactState->nextOffset; - LWLockRelease(MultiXactGenLock); - debug_elog3(DEBUG2, "MultiXact: truncation point = %u", oldestMXact); + return oldestMXact; +} + +typedef struct mxtruncinfo +{ + int earliestExistingPage; +} mxtruncinfo; + +/* + * SlruScanDirectory callback + * This callback determines the earliest existing page number. + */ +static bool +SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) +{ + mxtruncinfo *trunc = (mxtruncinfo *) data; + + if (trunc->earliestExistingPage == -1 || + ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) + { + trunc->earliestExistingPage = segpage; + } + + return false; /* keep going */ +} + +/* + * Remove all MultiXactOffset and MultiXactMember segments before the oldest + * ones still of interest. + * + * This is called by vacuum after it has successfully advanced a database's + * datminmxid value; the cutoff value we're passed is the minimum of all + * databases' datminmxid values. + */ +void +TruncateMultiXact(MultiXactId oldestMXact) +{ + MultiXactOffset oldestOffset; + mxtruncinfo trunc; + MultiXactId earliest; /* - * If we already truncated at this point, do nothing. This saves time - * when no MultiXacts are getting used, which is probably not uncommon. + * Note we can't just plow ahead with the truncation; it's possible that + * there are no segments to truncate, which is a problem because we are + * going to attempt to read the offsets page to determine where to truncate + * the members SLRU. So we first scan the directory to determine the + * earliest offsets page number that we can read without error. */ - if (MultiXactState->lastTruncationPoint == oldestMXact) + trunc.earliestExistingPage = -1; + SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + + /* nothing to do */ + if (MultiXactIdPrecedes(oldestMXact, earliest)) return; /* - * We need to determine where to truncate MultiXactMember. If we found a - * valid oldest MultiXactId, read its starting offset; otherwise we use - * the nextOffset value we saved above. + * First, compute the safe truncation point for MultiXactMember. + * This is the starting offset of the multixact we were passed + * as MultiXactOffset cutoff. */ - if (oldestMXact == nextMXact) - oldestOffset = nextOffset; - else { int pageno; int slotno; @@ -1857,34 +2235,23 @@ TruncateMultiXact(void) pageno = MultiXactIdToOffsetPage(oldestMXact); entryno = MultiXactIdToOffsetEntry(oldestMXact); - slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, oldestMXact); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, + oldestMXact); + offptr = (MultiXactOffset *) + MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; oldestOffset = *offptr; LWLockRelease(MultiXactOffsetControlLock); } - /* - * The cutoff point is the start of the segment containing oldestMXact. We - * pass the *page* containing oldestMXact to SimpleLruTruncate. 
- */ - cutoffPage = MultiXactIdToOffsetPage(oldestMXact); - - SimpleLruTruncate(MultiXactOffsetCtl, cutoffPage); - - /* - * Also truncate MultiXactMember at the previously determined offset. - */ - cutoffPage = MXOffsetToMemberPage(oldestOffset); + /* truncate MultiXactOffset */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(oldestMXact)); - SimpleLruTruncate(MultiXactMemberCtl, cutoffPage); - - /* - * Set the last known truncation point. We don't need a lock for this - * since only one backend does checkpoints at a time. - */ - MultiXactState->lastTruncationPoint = oldestMXact; + /* truncate MultiXactMembers and we're done */ + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(oldestOffset)); } /* @@ -1934,7 +2301,7 @@ MultiXactMemberPagePrecedes(int page1, int page2) * XXX do we need to do something special for InvalidMultiXactId? * (Doesn't look like it.) */ -static bool +bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) { int32 diff = (int32) (multi1 - multi2); @@ -1953,7 +2320,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) return (diff < 0); } - /* * Write an xlog record reflecting the zeroing of either a MEMBERs or * OFFSETs page (info shows which) @@ -2013,16 +2379,18 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) } else if (info == XLOG_MULTIXACT_CREATE_ID) { - xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record); - TransactionId *xids = xlrec->xids; + xl_multixact_create *xlrec = + (xl_multixact_create *) XLogRecGetData(record); TransactionId max_xid; int i; /* Store the data back into the SLRU files */ - RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids); + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, + xlrec->members); /* Make sure nextMXact/nextOffset are beyond what this record has */ - MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids); + MultiXactAdvanceNextMXact(xlrec->mid + 1, + xlrec->moff + xlrec->nmembers); /* * Make sure nextXid is beyond any XID mentioned in the record. This @@ -2030,10 +2398,10 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) * evidence in the XLOG, but let's be safe. 
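MultiXactIdPrecedes(), now exported above for use by other modules, relies on the standard circular-counter comparison: subtract in unsigned arithmetic and look at the sign of the result as an int32, so that "precedes" stays meaningful across the 2^32 wraparound. A small self-contained illustration, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* mirrors the (int32) (multi1 - multi2) < 0 test used above */
static int
example_precedes(uint32_t a, uint32_t b)
{
	int32_t diff = (int32_t) (a - b);

	return diff < 0;
}

int
main(void)
{
	assert(example_precedes(100, 200));			/* ordinary case */
	assert(example_precedes(0xFFFFFF00u, 10));	/* counter has wrapped: still "older" */
	assert(!example_precedes(10, 0xFFFFFF00u));
	return 0;
}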
*/ max_xid = record->xl_xid; - for (i = 0; i < xlrec->nxids; i++) + for (i = 0; i < xlrec->nmembers; i++) { - if (TransactionIdPrecedes(max_xid, xids[i])) - max_xid = xids[i]; + if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) + max_xid = xlrec->members[i].xid; } /* @@ -2053,3 +2421,72 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "multixact_redo: unknown op code %u", info); } + +Datum +pg_get_multixact_members(PG_FUNCTION_ARGS) +{ + typedef struct + { + MultiXactMember *members; + int nmembers; + int iter; + } mxact; + MultiXactId mxid = PG_GETARG_UINT32(0); + mxact *multi; + FuncCallContext *funccxt; + + if (mxid < FirstMultiXactId) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid MultiXactId: %u", mxid))); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + + funccxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + multi = palloc(sizeof(mxact)); + /* no need to allow for old values here */ + multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false); + multi->iter = 0; + + tupdesc = CreateTemplateTupleDesc(2, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode", + TEXTOID, -1, 0); + + funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); + funccxt->user_fctx = multi; + + MemoryContextSwitchTo(oldcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + multi = (mxact *) funccxt->user_fctx; + + while (multi->iter < multi->nmembers) + { + HeapTuple tuple; + char *values[2]; + + values[0] = palloc(32); + sprintf(values[0], "%u", multi->members[multi->iter].xid); + values[1] = mxstatus_to_string(multi->members[multi->iter].status); + + tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); + + multi->iter++; + pfree(values[0]); + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); + } + + if (multi->nmembers > 0) + pfree(multi->members); + pfree(multi); + + SRF_RETURN_DONE(funccxt); +} diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index f041e4b2c0..64537d0128 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -75,6 +75,8 @@ GetNewTransactionId(bool isSubXact) * If we're past xidStopLimit, refuse to execute transactions, unless * we are running in a standalone backend (which gives an escape hatch * to the DBA who somehow got past the earlier defenses). + * + * Note that this coding also appears in GetNewMultiXactId. 
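As a usage note for the pg_get_multixact_members() set-returning function added above: it can be called from SQL with the multixact ID cast to xid, e.g. something like SELECT * FROM pg_get_multixact_members('12345'::xid), and (assuming the accompanying pg_proc entry declares OUT columns named xid and mode) it returns one row per member carrying the member xid and the same status labels that mxstatus_to_string() produces, such as keysh or nokeyupd.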
*---------- */ if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit)) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d316c97926..cf2f6e70cf 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3899,6 +3899,8 @@ BootStrapXLOG(void) checkPoint.nextMultiOffset = 0; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = TemplateDbOid; + checkPoint.oldestMulti = FirstMultiXactId; + checkPoint.oldestMultiDB = TemplateDbOid; checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; @@ -3907,6 +3909,7 @@ BootStrapXLOG(void) ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); /* Set up the XLOG page header */ page->xlp_magic = XLOG_PAGE_MAGIC; @@ -4979,6 +4982,9 @@ StartupXLOG(void) ereport(DEBUG1, (errmsg("oldest unfrozen transaction ID: %u, in database %u", checkPoint.oldestXid, checkPoint.oldestXidDB))); + ereport(DEBUG1, + (errmsg("oldest MultiXactId: %u, in database %u", + checkPoint.oldestMulti, checkPoint.oldestMultiDB))); if (!TransactionIdIsNormal(checkPoint.nextXid)) ereport(PANIC, (errmsg("invalid next transaction ID"))); @@ -4989,6 +4995,7 @@ StartupXLOG(void) ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXid = checkPoint.nextXid; @@ -6724,7 +6731,9 @@ CreateCheckPoint(int flags) MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, - &checkPoint.nextMultiOffset); + &checkPoint.nextMultiOffset, + &checkPoint.oldestMulti, + &checkPoint.oldestMultiDB); /* * Having constructed the checkpoint record, ensure all shmem disk buffers @@ -7479,6 +7488,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); /* * If we see a shutdown checkpoint while waiting for an end-of-backup @@ -7577,6 +7587,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) checkPoint.oldestXid)) SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 263205855b..db51e0b608 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -30,6 +30,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/sysattr.h" #include "access/transam.h" #include "access/xact.h" @@ -779,6 +780,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_relhastriggers - 1] = BoolGetDatum(rd_rel->relhastriggers); values[Anum_pg_class_relhassubclass - 1] = BoolGetDatum(rd_rel->relhassubclass); values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid); + values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid); if 
(relacl != (Datum) 0) values[Anum_pg_class_relacl - 1] = relacl; else @@ -854,7 +856,7 @@ AddNewRelationTuple(Relation pg_class_desc, break; } - /* Initialize relfrozenxid */ + /* Initialize relfrozenxid and relminmxid */ if (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE) { @@ -864,6 +866,15 @@ AddNewRelationTuple(Relation pg_class_desc, * that will do. */ new_rel_reltup->relfrozenxid = RecentXmin; + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running + * transactions could reuse values from their local cache, so we are + * careful to consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + new_rel_reltup->relminmxid = GetOldestMultiXactId(); } else { @@ -874,6 +885,7 @@ AddNewRelationTuple(Relation pg_class_desc, * commands/sequence.c.) */ new_rel_reltup->relfrozenxid = InvalidTransactionId; + new_rel_reltup->relminmxid = InvalidMultiXactId; } new_rel_reltup->relowner = relowner; diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 5892e44667..9b339292e4 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -23,6 +23,7 @@ #include +#include "access/multixact.h" #include "access/relscan.h" #include "access/sysattr.h" #include "access/transam.h" @@ -2353,8 +2354,7 @@ IndexBuildHeapScan(Relation heapRelation, * As with INSERT_IN_PROGRESS case, this is unexpected * unless it's our own deletion or a system catalog. */ - Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); - xwait = HeapTupleHeaderGetXmax(heapTuple->t_data); + xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); if (!TransactionIdIsCurrentTransactionId(xwait)) { if (!is_system_catalog) @@ -3184,7 +3184,8 @@ reindex_index(Oid indexId, bool skip_constraint_checks) } /* We'll build a new physical relation for the index */ - RelationSetNewRelfilenode(iRel, InvalidTransactionId); + RelationSetNewRelfilenode(iRel, InvalidTransactionId, + InvalidMultiXactId); /* Initialize the index and rebuild */ /* Note: we do not need to re-establish pkey setting */ @@ -3364,7 +3365,7 @@ reindex_relation(Oid relid, int flags) /* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */ if (is_pg_class) - (void) RelationGetIndexAttrBitmap(rel); + (void) RelationGetIndexAttrBitmap(rel, false); PG_TRY(); { diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 7a5eb42424..d7b17a5aba 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -16,6 +16,7 @@ #include +#include "access/multixact.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -580,7 +581,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, totalrows, visibilitymap_count(onerel), hasindex, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); /* * Same for indexes. Vacuum always scans all indexes, so if we're part of @@ -600,7 +602,8 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, totalindexrows, 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); } } @@ -1193,7 +1196,7 @@ acquire_sample_rows(Relation onerel, int elevel, * right. (Note: this works out properly when the row was * both inserted and deleted in our xact.)
*/ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targtuple.t_data))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple.t_data))) deadrows += 1; else liverows += 1; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 238781b6a7..c0cb2f6654 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -17,6 +17,7 @@ */ #include "postgres.h" +#include "access/multixact.h" #include "access/relscan.h" #include "access/rewriteheap.h" #include "access/transam.h" @@ -65,7 +66,8 @@ static void rebuild_relation(Relation OldHeap, Oid indexOid, int freeze_min_age, int freeze_table_age, bool verbose); static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool verbose, - bool *pSwapToastByContent, TransactionId *pFreezeXid); + bool *pSwapToastByContent, TransactionId *pFreezeXid, + MultiXactId *pFreezeMulti); static List *get_tables_to_cluster(MemoryContext cluster_context); static void reform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc, @@ -549,6 +551,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool is_system_catalog; bool swap_toast_by_content; TransactionId frozenXid; + MultiXactId frozenMulti; /* Mark the correct index as clustered */ if (OidIsValid(indexOid)) @@ -566,14 +569,14 @@ rebuild_relation(Relation OldHeap, Oid indexOid, /* Copy the heap data into the new table in the desired order */ copy_heap_data(OIDNewHeap, tableOid, indexOid, freeze_min_age, freeze_table_age, verbose, - &swap_toast_by_content, &frozenXid); + &swap_toast_by_content, &frozenXid, &frozenMulti); /* * Swap the physical files of the target and transient tables, then * rebuild the target's indexes and throw away the transient table. 
*/ finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, - swap_toast_by_content, false, frozenXid); + swap_toast_by_content, false, frozenXid, frozenMulti); } @@ -706,7 +709,8 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool verbose, - bool *pSwapToastByContent, TransactionId *pFreezeXid) + bool *pSwapToastByContent, TransactionId *pFreezeXid, + MultiXactId *pFreezeMulti) { Relation NewHeap, OldHeap, @@ -722,6 +726,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool is_system_catalog; TransactionId OldestXmin; TransactionId FreezeXid; + MultiXactId MultiXactFrzLimit; RewriteState rwstate; bool use_sort; Tuplesortstate *tuplesort; @@ -822,7 +827,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, */ vacuum_set_xid_limits(freeze_min_age, freeze_table_age, OldHeap->rd_rel->relisshared, - &OldestXmin, &FreezeXid, NULL); + &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go @@ -831,14 +836,16 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid)) FreezeXid = OldHeap->rd_rel->relfrozenxid; - /* return selected value to caller */ + /* return selected values to caller */ *pFreezeXid = FreezeXid; + *pFreezeMulti = MultiXactFrzLimit; /* Remember if it's a system catalog */ is_system_catalog = IsSystemRelation(OldHeap); /* Initialize the rewrite operation */ - rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal); + rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, + MultiXactFrzLimit, use_wal); /* * Decide whether to use an indexscan or seqscan-and-optional-sort to scan @@ -966,9 +973,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, /* * Similar situation to INSERT_IN_PROGRESS case. */ - Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple->t_data))) + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) elog(WARNING, "concurrent delete in progress within table \"%s\"", RelationGetRelationName(OldHeap)); /* treat as recently dead */ @@ -1097,6 +1103,7 @@ static void swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, bool swap_toast_by_content, TransactionId frozenXid, + MultiXactId frozenMulti, Oid *mapped_tables) { Relation relRelation; @@ -1204,11 +1211,13 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, * and then fail to commit the pg_class update. */ - /* set rel1's frozen Xid */ + /* set rel1's frozen Xid and minimum MultiXid */ if (relform1->relkind != RELKIND_INDEX) { Assert(TransactionIdIsNormal(frozenXid)); relform1->relfrozenxid = frozenXid; + Assert(MultiXactIdIsValid(frozenMulti)); + relform1->relminmxid = frozenMulti; } /* swap size statistics too, since new rel has freshly-updated stats */ @@ -1272,6 +1281,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, target_is_pg_class, swap_toast_by_content, frozenXid, + frozenMulti, mapped_tables); } else @@ -1361,6 +1371,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, target_is_pg_class, swap_toast_by_content, InvalidTransactionId, + InvalidMultiXactId, mapped_tables); /* Clean up. 
*/ @@ -1398,7 +1409,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, bool check_constraints, - TransactionId frozenXid) + TransactionId frozenXid, + MultiXactId frozenMulti) { ObjectAddress object; Oid mapped_tables[4]; @@ -1414,7 +1426,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, */ swap_relation_files(OIDOldHeap, OIDNewHeap, (OIDOldHeap == RelationRelationId), - swap_toast_by_content, frozenXid, mapped_tables); + swap_toast_by_content, frozenXid, frozenMulti, + mapped_tables); /* * If it's a system catalog, queue an sinval message to flush all diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 4ad4b99758..5b06af24a6 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -80,6 +80,7 @@ static bool get_db_info(const char *name, LOCKMODE lockmode, Oid *dbIdP, Oid *ownerIdP, int *encodingP, bool *dbIsTemplateP, bool *dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, + MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype); static bool have_createdb_privilege(void); static void remove_dbtablespaces(Oid db_id); @@ -104,6 +105,7 @@ createdb(const CreatedbStmt *stmt) bool src_allowconn; Oid src_lastsysoid; TransactionId src_frozenxid; + MultiXactId src_minmxid; Oid src_deftablespace; volatile Oid dst_deftablespace; Relation pg_database_rel; @@ -288,7 +290,7 @@ createdb(const CreatedbStmt *stmt) if (!get_db_info(dbtemplate, ShareLock, &src_dboid, &src_owner, &src_encoding, &src_istemplate, &src_allowconn, &src_lastsysoid, - &src_frozenxid, &src_deftablespace, + &src_frozenxid, &src_minmxid, &src_deftablespace, &src_collate, &src_ctype)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), @@ -491,6 +493,7 @@ createdb(const CreatedbStmt *stmt) new_record[Anum_pg_database_datconnlimit - 1] = Int32GetDatum(dbconnlimit); new_record[Anum_pg_database_datlastsysoid - 1] = ObjectIdGetDatum(src_lastsysoid); new_record[Anum_pg_database_datfrozenxid - 1] = TransactionIdGetDatum(src_frozenxid); + new_record[Anum_pg_database_datminmxid - 1] = TransactionIdGetDatum(src_minmxid); new_record[Anum_pg_database_dattablespace - 1] = ObjectIdGetDatum(dst_deftablespace); /* @@ -786,7 +789,7 @@ dropdb(const char *dbname, bool missing_ok) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL)) + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { if (!missing_ok) { @@ -945,7 +948,7 @@ RenameDatabase(const char *oldname, const char *newname) rel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", oldname))); @@ -1046,7 +1049,7 @@ movedb(const char *dbname, const char *tblspcname) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, - NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) + NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); @@ -1599,6 +1602,7 @@ get_db_info(const char *name, LOCKMODE lockmode, Oid *dbIdP, Oid *ownerIdP, int *encodingP, bool *dbIsTemplateP, bool 
*dbAllowConnP, Oid *dbLastSysOidP, TransactionId *dbFrozenXidP, + MultiXactId *dbMinMultiP, Oid *dbTablespace, char **dbCollate, char **dbCtype) { bool result = false; @@ -1685,6 +1689,9 @@ get_db_info(const char *name, LOCKMODE lockmode, /* limit of frozen XIDs */ if (dbFrozenXidP) *dbFrozenXidP = dbform->datfrozenxid; + /* limit of frozen Multixacts */ + if (dbMinMultiP) + *dbMinMultiP = dbform->datminmxid; /* default tablespace for this database */ if (dbTablespace) *dbTablespace = dbform->dattablespace; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 1f2546d69c..de41c8a1c7 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -14,8 +14,9 @@ */ #include "postgres.h" -#include "access/transam.h" #include "access/htup_details.h" +#include "access/multixact.h" +#include "access/transam.h" #include "access/xlogutils.h" #include "catalog/dependency.h" #include "catalog/namespace.h" @@ -282,8 +283,10 @@ ResetSequence(Oid seq_relid) /* * Create a new storage file for the sequence. We want to keep the * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs. + * Same with relminmxid, since a sequence will never contain multixacts. */ - RelationSetNewRelfilenode(seq_rel, InvalidTransactionId); + RelationSetNewRelfilenode(seq_rel, InvalidTransactionId, + InvalidMultiXactId); /* * Insert the modified tuple into the new storage file. @@ -1110,7 +1113,8 @@ read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple) * bit update, ie, don't bother to WAL-log it, since we can certainly do * this again if the update gets lost. */ - if (HeapTupleHeaderGetXmax(seqtuple->t_data) != InvalidTransactionId) + Assert(!(seqtuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); + if (HeapTupleHeaderGetRawXmax(seqtuple->t_data) != InvalidTransactionId) { HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId); seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cad83117f9..6bc056bbc3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -15,7 +15,9 @@ #include "postgres.h" #include "access/genam.h" +#include "access/heapam.h" #include "access/heapam_xlog.h" +#include "access/multixact.h" #include "access/reloptions.h" #include "access/relscan.h" #include "access/sysattr.h" @@ -1130,6 +1132,7 @@ ExecuteTruncate(TruncateStmt *stmt) { Oid heap_relid; Oid toast_relid; + MultiXactId minmulti; /* * This effectively deletes all rows in the table, and may be done @@ -1139,6 +1142,8 @@ ExecuteTruncate(TruncateStmt *stmt) */ CheckTableForSerializableConflictIn(rel); + minmulti = GetOldestMultiXactId(); + /* * Need the full transaction-safe pushups. * @@ -1146,7 +1151,7 @@ ExecuteTruncate(TruncateStmt *stmt) * as the relfilenode value. The old storage file is scheduled for * deletion at commit. 
*/ - RelationSetNewRelfilenode(rel, RecentXmin); + RelationSetNewRelfilenode(rel, RecentXmin, minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); @@ -1159,7 +1164,7 @@ ExecuteTruncate(TruncateStmt *stmt) if (OidIsValid(toast_relid)) { rel = relation_open(toast_relid, AccessExclusiveLock); - RelationSetNewRelfilenode(rel, RecentXmin); + RelationSetNewRelfilenode(rel, RecentXmin, minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); heap_close(rel, NoLock); @@ -3516,7 +3521,8 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode) * interest in letting this code work on system catalogs. */ finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, RecentXmin); + false, false, true, RecentXmin, + ReadNextMultiXactId()); } else { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index a719cf24f4..f11a8ec5d4 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -73,6 +73,7 @@ static HeapTuple GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tid, + LockTupleMode lockmode, TupleTableSlot **newSlot); static bool TriggerEnabled(EState *estate, ResultRelInfo *relinfo, Trigger *trigger, TriggerEvent event, @@ -2147,7 +2148,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, int i; trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - &newSlot); + LockTupleExclusive, &newSlot); if (trigtuple == NULL) return false; @@ -2201,7 +2202,8 @@ ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, if (trigdesc && trigdesc->trig_delete_after_row) { HeapTuple trigtuple = GetTupleForTrigger(estate, NULL, relinfo, - tupleid, NULL); + tupleid, LockTupleExclusive, + NULL); AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_DELETE, true, trigtuple, NULL, NIL, NULL); @@ -2332,10 +2334,24 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, TupleTableSlot *newSlot; int i; Bitmapset *modifiedCols; + Bitmapset *keyCols; + LockTupleMode lockmode; + + /* + * Compute lock mode to use. If columns that are part of the key have not + * been modified, then we can use a weaker lock, allowing for better + * concurrency. 
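The decision that follows is the heart of the weaker-lock optimization: compute the set of modified columns, compute the set of columns usable by foreign keys, and take LockTupleExclusive only if they overlap. A compressed, self-contained sketch of that test, with Toy* stand-ins for Bitmapset and LockTupleMode (illustrative names only, not backend symbols):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ToyColumnSet;              /* toy stand-in for a Bitmapset of columns */

    typedef enum
    {
        TOY_LOCK_TUPLE_NOKEY_EXCLUSIVE,         /* taken by UPDATEs that leave keys alone   */
        TOY_LOCK_TUPLE_EXCLUSIVE                /* taken by DELETE and key-changing UPDATEs */
    } ToyLockTupleMode;

    /* Same decision as in the trigger code: any overlap between key columns and
     * modified columns forces the stronger lock; otherwise the weaker one is enough. */
    static ToyLockTupleMode
    toy_choose_lock_mode(ToyColumnSet key_cols, ToyColumnSet modified_cols)
    {
        return (key_cols & modified_cols) != 0
            ? TOY_LOCK_TUPLE_EXCLUSIVE
            : TOY_LOCK_TUPLE_NOKEY_EXCLUSIVE;
    }

    int
    main(void)
    {
        ToyColumnSet key = 0x1;                            /* first column is part of the key */

        printf("%d\n", toy_choose_lock_mode(key, 0x2));    /* 0: non-key update, weaker lock  */
        printf("%d\n", toy_choose_lock_mode(key, 0x3));    /* 1: key changed, exclusive lock  */
        return 0;
    }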
+ */ + modifiedCols = GetModifiedColumns(relinfo, estate); + keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, true); + if (bms_overlap(keyCols, modifiedCols)) + lockmode = LockTupleExclusive; + else + lockmode = LockTupleNoKeyExclusive; /* get a copy of the on-disk tuple we are planning to update */ trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - &newSlot); + lockmode, &newSlot); if (trigtuple == NULL) return NULL; /* cancel the update action */ @@ -2357,7 +2373,6 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, newtuple = slottuple; } - modifiedCols = GetModifiedColumns(relinfo, estate); LocTriggerData.type = T_TriggerData; LocTriggerData.tg_event = TRIGGER_EVENT_UPDATE | @@ -2426,7 +2441,8 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, if (trigdesc && trigdesc->trig_update_after_row) { HeapTuple trigtuple = GetTupleForTrigger(estate, NULL, relinfo, - tupleid, NULL); + tupleid, LockTupleExclusive, + NULL); AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_UPDATE, true, trigtuple, newtuple, recheckIndexes, @@ -2565,6 +2581,7 @@ GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tid, + LockTupleMode lockmode, TupleTableSlot **newSlot) { Relation relation = relinfo->ri_RelationDesc; @@ -2589,8 +2606,8 @@ ltrmark:; tuple.t_self = *tid; test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, - LockTupleExclusive, false /* wait */, - &buffer, &hufd); + lockmode, false /* wait */, + false, &buffer, &hufd); switch (test) { case HeapTupleSelfUpdated: @@ -2630,6 +2647,7 @@ ltrmark:; epqstate, relation, relinfo->ri_RangeTableIndex, + lockmode, &hufd.ctid, hufd.xmax); if (!TupIsNull(epqslot)) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 2d3170a250..a37a54e5b4 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -26,6 +26,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/transam.h" #include "access/xact.h" #include "catalog/namespace.h" @@ -63,7 +64,7 @@ static BufferAccessStrategy vac_strategy; /* non-export function prototypes */ static List *get_rel_oids(Oid relid, const RangeVar *vacrel); -static void vac_truncate_clog(TransactionId frozenXID); +static void vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti); static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound); @@ -379,7 +380,8 @@ vacuum_set_xid_limits(int freeze_min_age, bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, - TransactionId *freezeTableLimit) + TransactionId *freezeTableLimit, + MultiXactId *multiXactFrzLimit) { int freezemin; TransactionId limit; @@ -463,8 +465,22 @@ vacuum_set_xid_limits(int freeze_min_age, *freezeTableLimit = limit; } -} + if (multiXactFrzLimit != NULL) + { + MultiXactId mxLimit; + + /* + * simplistic multixactid freezing: use the same freezing policy as + * for Xids + */ + mxLimit = GetOldestMultiXactId() - freezemin; + if (mxLimit < FirstMultiXactId) + mxLimit = FirstMultiXactId; + + *multiXactFrzLimit = mxLimit; + } +} /* * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples @@ -574,7 +590,8 @@ void vac_update_relstats(Relation relation, BlockNumber num_pages, double num_tuples, BlockNumber num_all_visible_pages, - bool hasindex, TransactionId frozenxid) + bool hasindex, TransactionId frozenxid, + MultiXactId minmulti) { Oid relid = 
RelationGetRelid(relation); Relation rd; @@ -648,6 +665,14 @@ vac_update_relstats(Relation relation, dirty = true; } + /* relminmxid must never go backward, either */ + if (MultiXactIdIsValid(minmulti) && + MultiXactIdPrecedes(pgcform->relminmxid, minmulti)) + { + pgcform->relminmxid = minmulti; + dirty = true; + } + /* If anything changed, write out the tuple. */ if (dirty) heap_inplace_update(rd, ctup); @@ -660,8 +685,13 @@ vac_update_relstats(Relation relation, * vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB * * Update pg_database's datfrozenxid entry for our database to be the - * minimum of the pg_class.relfrozenxid values. If we are able to - * advance pg_database.datfrozenxid, also try to truncate pg_clog. + * minimum of the pg_class.relfrozenxid values. + * + * Similarly, update our datfrozenmulti to be the minimum of the + * pg_class.relfrozenmulti values. + * + * If we are able to advance either pg_database value, also try to + * truncate pg_clog and pg_multixact. * * We violate transaction semantics here by overwriting the database's * existing pg_database tuple with the new value. This is reasonably @@ -678,16 +708,23 @@ vac_update_datfrozenxid(void) SysScanDesc scan; HeapTuple classTup; TransactionId newFrozenXid; + MultiXactId newFrozenMulti; bool dirty = false; /* * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- * committed pg_class entries for new tables; see AddNewRelationTuple(). - * Se we cannot produce a wrong minimum by starting with this. + * So we cannot produce a wrong minimum by starting with this. */ newFrozenXid = GetOldestXmin(true, true); + /* + * Similarly, initialize the MultiXact "min" with the value that would + * be used on pg_class for new tables. See AddNewRelationTuple(). + */ + newFrozenMulti = GetOldestMultiXactId(); + /* * We must seqscan pg_class to find the minimum Xid, because there is no * index that can help us here. @@ -710,9 +747,13 @@ vac_update_datfrozenxid(void) continue; Assert(TransactionIdIsNormal(classForm->relfrozenxid)); + Assert(MultiXactIdIsValid(classForm->relminmxid)); if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid)) newFrozenXid = classForm->relfrozenxid; + + if (MultiXactIdPrecedes(classForm->relminmxid, newFrozenMulti)) + newFrozenMulti = classForm->relminmxid; } /* we're done with pg_class */ @@ -720,6 +761,7 @@ vac_update_datfrozenxid(void) heap_close(relation, AccessShareLock); Assert(TransactionIdIsNormal(newFrozenXid)); + Assert(MultiXactIdIsValid(newFrozenMulti)); /* Now fetch the pg_database tuple we need to update. */ relation = heap_open(DatabaseRelationId, RowExclusiveLock); @@ -740,6 +782,13 @@ vac_update_datfrozenxid(void) dirty = true; } + /* ditto */ + if (MultiXactIdPrecedes(dbform->datminmxid, newFrozenMulti)) + { + dbform->datminmxid = newFrozenMulti; + dirty = true; + } + if (dirty) heap_inplace_update(relation, tuple); @@ -752,7 +801,7 @@ vac_update_datfrozenxid(void) * this action will update that too. */ if (dirty || ForceTransactionIdLimitUpdate()) - vac_truncate_clog(newFrozenXid); + vac_truncate_clog(newFrozenXid, newFrozenMulti); } @@ -771,17 +820,19 @@ vac_update_datfrozenxid(void) * info is stale. 
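All of the relminmxid/datminmxid bookkeeping above relies on circular, wraparound-aware comparisons rather than plain integer ones. A standalone sketch in the style of MultiXactIdPrecedes, assuming the usual signed-difference idiom (Toy* names are illustrative only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ToyMultiXactId;

    /* Circular comparison: "a precedes b" when a is at most 2^31 steps
     * behind b on the 32-bit counter circle. */
    static bool
    toy_multi_precedes(ToyMultiXactId a, ToyMultiXactId b)
    {
        return (int32_t) (a - b) < 0;
    }

    int
    main(void)
    {
        printf("%d\n", toy_multi_precedes(100, 200));          /* 1: plainly older       */
        printf("%d\n", toy_multi_precedes(4294967290u, 10));   /* 1: older across a wrap */
        printf("%d\n", toy_multi_precedes(200, 100));          /* 0                      */
        return 0;
    }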
*/ static void -vac_truncate_clog(TransactionId frozenXID) +vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti) { TransactionId myXID = GetCurrentTransactionId(); Relation relation; HeapScanDesc scan; HeapTuple tuple; - Oid oldest_datoid; + Oid oldestxid_datoid; + Oid oldestmulti_datoid; bool frozenAlreadyWrapped = false; - /* init oldest_datoid to sync with my frozenXID */ - oldest_datoid = MyDatabaseId; + /* init oldest datoids to sync with my frozen values */ + oldestxid_datoid = MyDatabaseId; + oldestmulti_datoid = MyDatabaseId; /* * Scan pg_database to compute the minimum datfrozenxid @@ -804,13 +855,20 @@ vac_truncate_clog(TransactionId frozenXID) Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple); Assert(TransactionIdIsNormal(dbform->datfrozenxid)); + Assert(MultiXactIdIsValid(dbform->datminmxid)); if (TransactionIdPrecedes(myXID, dbform->datfrozenxid)) frozenAlreadyWrapped = true; else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID)) { frozenXID = dbform->datfrozenxid; - oldest_datoid = HeapTupleGetOid(tuple); + oldestxid_datoid = HeapTupleGetOid(tuple); + } + + if (MultiXactIdPrecedes(dbform->datminmxid, frozenMulti)) + { + frozenMulti = dbform->datminmxid; + oldestmulti_datoid = HeapTupleGetOid(tuple); } } @@ -832,14 +890,18 @@ vac_truncate_clog(TransactionId frozenXID) return; } - /* Truncate CLOG to the oldest frozenxid */ + /* Truncate CLOG and Multi to the oldest computed value */ TruncateCLOG(frozenXID); + TruncateMultiXact(frozenMulti); /* - * Update the wrap limit for GetNewTransactionId. Note: this function - * will also signal the postmaster for an(other) autovac cycle if needed. + * Update the wrap limit for GetNewTransactionId and creation of new + * MultiXactIds. Note: these functions will also signal the postmaster for + * an(other) autovac cycle if needed. XXX should we avoid possibly + * signalling twice? 
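For reference, the multixact freeze limit being propagated here was computed a little earlier, in vacuum_set_xid_limits, by reusing the XID freeze age and pinning the result at the first valid MultiXactId. A toy sketch of that arithmetic (stand-in names, not backend symbols):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ToyMultiXactId;

    #define TOY_FIRST_MULTIXACT_ID ((ToyMultiXactId) 1)    /* 0 is reserved as "invalid" */

    /* Subtract the freeze age from the oldest multixact still needed, pinning
     * the result at the first valid value so it never lands on the reserved 0. */
    static ToyMultiXactId
    toy_multixact_freeze_limit(ToyMultiXactId oldest, uint32_t freeze_min_age)
    {
        ToyMultiXactId limit = oldest - freeze_min_age;

        if (limit < TOY_FIRST_MULTIXACT_ID)
            limit = TOY_FIRST_MULTIXACT_ID;
        return limit;
    }

    int
    main(void)
    {
        printf("%u\n", (unsigned) toy_multixact_freeze_limit(1000, 50));   /* 950         */
        printf("%u\n", (unsigned) toy_multixact_freeze_limit(50, 50));     /* pinned to 1 */
        return 0;
    }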
*/ - SetTransactionIdLimit(frozenXID, oldest_datoid); + SetTransactionIdLimit(frozenXID, oldestxid_datoid); + MultiXactAdvanceOldest(frozenMulti, oldestmulti_datoid); } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 8eda66364b..5ec65ea41b 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -41,6 +41,7 @@ #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/transam.h" #include "access/visibilitymap.h" #include "catalog/storage.h" @@ -124,6 +125,7 @@ static int elevel = -1; static TransactionId OldestXmin; static TransactionId FreezeLimit; +static MultiXactId MultiXactFrzLimit; static BufferAccessStrategy vac_strategy; @@ -180,6 +182,7 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, double new_rel_tuples; BlockNumber new_rel_allvisible; TransactionId new_frozen_xid; + MultiXactId new_min_multi; /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) @@ -197,7 +200,8 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, - &OldestXmin, &FreezeLimit, &freezeTableLimit); + &OldestXmin, &FreezeLimit, &freezeTableLimit, + &MultiXactFrzLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); @@ -267,12 +271,17 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, if (vacrelstats->scanned_pages < vacrelstats->rel_pages) new_frozen_xid = InvalidTransactionId; + new_min_multi = MultiXactFrzLimit; + if (vacrelstats->scanned_pages < vacrelstats->rel_pages) + new_min_multi = InvalidMultiXactId; + vac_update_relstats(onerel, new_rel_pages, new_rel_tuples, new_rel_allvisible, vacrelstats->hasindex, - new_frozen_xid); + new_frozen_xid, + new_min_multi); /* * Report results to the stats collector, too. An early terminated @@ -839,7 +848,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. 
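A small sketch of the rule applied just below in lazy_vacuum_rel: the pg_class horizons may only be advanced after a scan that visited every page; a partial scan reports an invalid value so vac_update_relstats leaves the stored horizon alone (Toy* names are stand-ins, not backend symbols):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ToyMultiXactId;

    #define TOY_INVALID_MULTI ((ToyMultiXactId) 0)

    /* Returns the value to report as the relation's minimum multixact, or the
     * invalid value when pages were skipped and nothing may be advanced. */
    static ToyMultiXactId
    toy_new_relminmxid(uint32_t scanned_pages, uint32_t rel_pages,
                       ToyMultiXactId freeze_limit)
    {
        if (scanned_pages < rel_pages)
            return TOY_INVALID_MULTI;
        return freeze_limit;
    }

    int
    main(void)
    {
        printf("%u\n", (unsigned) toy_new_relminmxid(100, 100, 42));   /* full scan: 42  */
        printf("%u\n", (unsigned) toy_new_relminmxid(80, 100, 42));    /* partial scan: 0 */
        return 0;
    }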
*/ - if (heap_freeze_tuple(tuple.t_data, FreezeLimit)) + if (heap_freeze_tuple(tuple.t_data, FreezeLimit, + MultiXactFrzLimit)) frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -857,7 +867,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, - frozen, nfrozen); + MultiXactFrzLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -1176,7 +1186,8 @@ lazy_check_needs_freeze(Buffer buf) tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); - if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, buf)) + if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, + MultiXactFrzLimit, buf)) return true; } /* scan along page */ @@ -1253,7 +1264,8 @@ lazy_cleanup_index(Relation indrel, stats->num_index_tuples, 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); ereport(elevel, (errmsg("index \"%s\" now contains %.0f row versions in %u pages", diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 9d5d829406..23a6a61256 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -162,7 +162,8 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) case CMD_SELECT: /* - * SELECT FOR UPDATE/SHARE and modifying CTEs need to mark tuples + * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark + * tuples */ if (queryDesc->plannedstmt->rowMarks != NIL || queryDesc->plannedstmt->hasModifyingCTE) @@ -775,7 +776,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) } /* - * Similarly, we have to lock relations selected FOR UPDATE/FOR SHARE + * Similarly, we have to lock relations selected FOR [KEY] UPDATE/SHARE * before we initialize the plan tree, else we'd be risking lock upgrades. * While we are at it, build the ExecRowMark list. */ @@ -794,7 +795,9 @@ InitPlan(QueryDesc *queryDesc, int eflags) switch (rc->markType) { case ROW_MARK_EXCLUSIVE: + case ROW_MARK_NOKEYEXCLUSIVE: case ROW_MARK_SHARE: + case ROW_MARK_KEYSHARE: relid = getrelid(rc->rti, rangeTable); relation = heap_open(relid, RowShareLock); break; @@ -1341,7 +1344,7 @@ ExecEndPlan(PlanState *planstate, EState *estate) } /* - * close any relations selected FOR UPDATE/FOR SHARE, again keeping locks + * close any relations selected FOR [KEY] UPDATE/SHARE, again keeping locks */ foreach(l, estate->es_rowMarks) { @@ -1694,6 +1697,7 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) * epqstate - state for EvalPlanQual rechecking * relation - table containing tuple * rti - rangetable index of table containing tuple + * lockmode - requested tuple lock mode * *tid - t_ctid from the outdated tuple (ie, next updated version) * priorXmax - t_xmax from the outdated tuple * @@ -1702,10 +1706,13 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) * * Returns a slot containing the new candidate update/delete tuple, or * NULL if we determine we shouldn't process the row. + * + * Note: properly, lockmode should be declared as enum LockTupleMode, + * but we use "int" to avoid having to include heapam.h in executor.h. */ TupleTableSlot * EvalPlanQual(EState *estate, EPQState *epqstate, - Relation relation, Index rti, + Relation relation, Index rti, int lockmode, ItemPointer tid, TransactionId priorXmax) { TupleTableSlot *slot; @@ -1716,7 +1723,7 @@ EvalPlanQual(EState *estate, EPQState *epqstate, /* * Get and lock the updated version of the row; if fail, return NULL. 
*/ - copyTuple = EvalPlanQualFetch(estate, relation, LockTupleExclusive, + copyTuple = EvalPlanQualFetch(estate, relation, lockmode, tid, priorXmax); if (copyTuple == NULL) @@ -1864,7 +1871,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, lockmode, false /* wait */, - &buffer, &hufd); + false, &buffer, &hufd); /* We now have two pins on the buffer, get rid of one */ ReleaseBuffer(buffer); @@ -1965,7 +1972,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, /* updated, so look at the updated row */ tuple.t_self = tuple.t_data->t_ctid; /* updated row should have xmin matching this xmax */ - priorXmax = HeapTupleHeaderGetXmax(tuple.t_data); + priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data); ReleaseBuffer(buffer); /* loop back to fetch next in chain */ } diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 9b9d7941c8..ae2d26b48b 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -111,14 +111,29 @@ lnext: tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); /* okay, try to lock the tuple */ - if (erm->markType == ROW_MARK_EXCLUSIVE) - lockmode = LockTupleExclusive; - else - lockmode = LockTupleShared; + switch (erm->markType) + { + case ROW_MARK_EXCLUSIVE: + lockmode = LockTupleExclusive; + break; + case ROW_MARK_NOKEYEXCLUSIVE: + lockmode = LockTupleNoKeyExclusive; + break; + case ROW_MARK_SHARE: + lockmode = LockTupleShare; + break; + case ROW_MARK_KEYSHARE: + lockmode = LockTupleKeyShare; + break; + default: + elog(ERROR, "unsupported rowmark type"); + lockmode = LockTupleNoKeyExclusive; /* keep compiler quiet */ + break; + } test = heap_lock_tuple(erm->relation, &tuple, estate->es_output_cid, - lockmode, erm->noWait, + lockmode, erm->noWait, true, &buffer, &hufd); ReleaseBuffer(buffer); switch (test) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 40cc423c76..cb084d03d4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -403,6 +403,7 @@ ldelete:; epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, + LockTupleExclusive, &hufd.ctid, hufd.xmax); if (!TupIsNull(epqslot)) @@ -569,6 +570,8 @@ ExecUpdate(ItemPointer tupleid, } else { + LockTupleMode lockmode; + /* * Check the constraints of the tuple * @@ -595,7 +598,7 @@ lreplace:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */, - &hufd); + &hufd, &lockmode); switch (result) { case HeapTupleSelfUpdated: @@ -647,6 +650,7 @@ lreplace:; epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, + lockmode, &hufd.ctid, hufd.xmax); if (!TupIsNull(epqslot)) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 9a01ec6d59..2da08d1cc1 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2037,7 +2037,7 @@ _copyRowMarkClause(const RowMarkClause *from) RowMarkClause *newnode = makeNode(RowMarkClause); COPY_SCALAR_FIELD(rti); - COPY_SCALAR_FIELD(forUpdate); + COPY_SCALAR_FIELD(strength); COPY_SCALAR_FIELD(noWait); COPY_SCALAR_FIELD(pushedDown); @@ -2400,7 +2400,7 @@ _copyLockingClause(const LockingClause *from) LockingClause *newnode = makeNode(LockingClause); COPY_NODE_FIELD(lockedRels); - COPY_SCALAR_FIELD(forUpdate); + COPY_SCALAR_FIELD(strength); COPY_SCALAR_FIELD(noWait); return newnode; diff --git a/src/backend/nodes/equalfuncs.c 
b/src/backend/nodes/equalfuncs.c index 034159da31..9e313c8b1b 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2210,7 +2210,7 @@ static bool _equalLockingClause(const LockingClause *a, const LockingClause *b) { COMPARE_NODE_FIELD(lockedRels); - COMPARE_SCALAR_FIELD(forUpdate); + COMPARE_SCALAR_FIELD(strength); COMPARE_SCALAR_FIELD(noWait); return true; @@ -2283,7 +2283,7 @@ static bool _equalRowMarkClause(const RowMarkClause *a, const RowMarkClause *b) { COMPARE_SCALAR_FIELD(rti); - COMPARE_SCALAR_FIELD(forUpdate); + COMPARE_SCALAR_FIELD(strength); COMPARE_SCALAR_FIELD(noWait); COMPARE_SCALAR_FIELD(pushedDown); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 484e426489..ffd123d506 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2111,7 +2111,7 @@ _outLockingClause(StringInfo str, const LockingClause *node) WRITE_NODE_TYPE("LOCKINGCLAUSE"); WRITE_NODE_FIELD(lockedRels); - WRITE_BOOL_FIELD(forUpdate); + WRITE_ENUM_FIELD(strength, LockClauseStrength); WRITE_BOOL_FIELD(noWait); } @@ -2289,7 +2289,7 @@ _outRowMarkClause(StringInfo str, const RowMarkClause *node) WRITE_NODE_TYPE("ROWMARKCLAUSE"); WRITE_UINT_FIELD(rti); - WRITE_BOOL_FIELD(forUpdate); + WRITE_ENUM_FIELD(strength, LockClauseStrength); WRITE_BOOL_FIELD(noWait); WRITE_BOOL_FIELD(pushedDown); } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index ed2354144c..472c82361a 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -301,7 +301,7 @@ _readRowMarkClause(void) READ_LOCALS(RowMarkClause); READ_UINT_FIELD(rti); - READ_BOOL_FIELD(forUpdate); + READ_ENUM_FIELD(strength, LockClauseStrength); READ_BOOL_FIELD(noWait); READ_BOOL_FIELD(pushedDown); diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 774b9d627d..11d951cabe 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -861,11 +861,11 @@ make_outerjoininfo(PlannerInfo *root, Assert(jointype != JOIN_RIGHT); /* - * Presently the executor cannot support FOR UPDATE/SHARE marking of rels + * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of rels * appearing on the nullable side of an outer join. (It's somewhat unclear * what that would mean, anyway: what should we mark when a result row is * generated from no element of the nullable relation?) So, complain if - * any nullable rel is FOR UPDATE/SHARE. + * any nullable rel is FOR [KEY] UPDATE/SHARE. * * You might be wondering why this test isn't made far upstream in the * parser. 
It's because the parser hasn't got enough info --- consider @@ -883,7 +883,7 @@ make_outerjoininfo(PlannerInfo *root, (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to the nullable side of an outer join"))); + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to the nullable side of an outer join"))); } sjinfo->syn_lefthand = left_rels; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index de975d8791..3e75d3994c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -562,7 +562,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, returningLists = NIL; /* - * If there was a FOR UPDATE/SHARE clause, the LockRows node will + * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will * have dealt with fetching non-locked marked rows, else we need * to have ModifyTable do that. */ @@ -954,7 +954,7 @@ inheritance_planner(PlannerInfo *root) root->simple_rel_array = save_rel_array; /* - * If there was a FOR UPDATE/SHARE clause, the LockRows node will have + * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will have * dealt with fetching non-locked marked rows, else we need to have * ModifyTable do that. */ @@ -1065,13 +1065,13 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) tlist); /* - * Can't handle FOR UPDATE/SHARE here (parser should have checked + * Can't handle FOR [KEY] UPDATE/SHARE here (parser should have checked * already, but let's make sure). */ if (parse->rowMarks) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT"))); + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE is not allowed with UNION/INTERSECT/EXCEPT"))); /* * Calculate pathkeys that represent result ordering requirements @@ -1797,7 +1797,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) } /* - * If there is a FOR UPDATE/SHARE clause, add the LockRows node. (Note: we + * If there is a FOR [KEY] UPDATE/SHARE clause, add the LockRows node. (Note: we * intentionally test parse->rowMarks not root->rowMarks here. If there * are only non-locking rowmarks, they should be handled by the * ModifyTable node instead.) @@ -1983,7 +1983,7 @@ preprocess_rowmarks(PlannerInfo *root) if (parse->rowMarks) { /* - * We've got trouble if FOR UPDATE/SHARE appears inside grouping, + * We've got trouble if FOR [KEY] UPDATE/SHARE appears inside grouping, * since grouping renders a reference to individual tuple CTIDs * invalid. This is also checked at parse time, but that's * insufficient because of rule substitution, query pullup, etc. @@ -1993,7 +1993,7 @@ preprocess_rowmarks(PlannerInfo *root) else { /* - * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE. + * We only need rowmarks for UPDATE, DELETE, or FOR [KEY] UPDATE/SHARE. */ if (parse->commandType != CMD_UPDATE && parse->commandType != CMD_DELETE) @@ -2003,7 +2003,7 @@ preprocess_rowmarks(PlannerInfo *root) /* * We need to have rowmarks for all base relations except the target. We * make a bitmapset of all base rels and then remove the items we don't - * need or have FOR UPDATE/SHARE marks for. + * need or have FOR [KEY] UPDATE/SHARE marks for. 
*/ rels = get_base_rel_indexes((Node *) parse->jointree); if (parse->resultRelation) @@ -2020,7 +2020,7 @@ preprocess_rowmarks(PlannerInfo *root) PlanRowMark *newrc; /* - * Currently, it is syntactically impossible to have FOR UPDATE + * Currently, it is syntactically impossible to have FOR UPDATE et al * applied to an update/delete target rel. If that ever becomes * possible, we should drop the target from the PlanRowMark list. */ @@ -2040,10 +2040,21 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - if (rc->forUpdate) - newrc->markType = ROW_MARK_EXCLUSIVE; - else - newrc->markType = ROW_MARK_SHARE; + switch (rc->strength) + { + case LCS_FORUPDATE: + newrc->markType = ROW_MARK_EXCLUSIVE; + break; + case LCS_FORNOKEYUPDATE: + newrc->markType = ROW_MARK_NOKEYEXCLUSIVE; + break; + case LCS_FORSHARE: + newrc->markType = ROW_MARK_SHARE; + break; + case LCS_FORKEYSHARE: + newrc->markType = ROW_MARK_KEYSHARE; + break; + } newrc->noWait = rc->noWait; newrc->isParent = false; diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 5aa6ecce7b..ede41af6db 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -2139,7 +2139,7 @@ transformCreateTableAsStmt(ParseState *pstate, CreateTableAsStmt *stmt) /* - * Check for features that are not supported together with FOR UPDATE/SHARE. + * Check for features that are not supported together with FOR [KEY] UPDATE/SHARE. * * exported so planner can check again after rewriting, query pullup, etc */ @@ -2149,35 +2149,35 @@ CheckSelectLocking(Query *qry) if (qry->setOperations) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with UNION/INTERSECT/EXCEPT"))); if (qry->distinctClause != NIL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with DISTINCT clause"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with DISTINCT clause"))); if (qry->groupClause != NIL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with GROUP BY clause"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with GROUP BY clause"))); if (qry->havingQual != NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with HAVING clause"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with HAVING clause"))); if (qry->hasAggs) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with aggregate functions"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with aggregate functions"))); if (qry->hasWindowFuncs) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with window functions"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with window functions"))); if (expression_returns_set((Node *) qry->targetList)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with set-returning functions in the target list"))); + errmsg("SELECT FOR UPDATE/SHARE/FOR KEY UPDATE/FOR KEY SHARE is not allowed with 
set-returning functions in the target list"))); } /* - * Transform a FOR UPDATE/SHARE clause + * Transform a FOR [KEY] UPDATE/SHARE clause * * This basically involves replacing names by integer relids. * @@ -2199,7 +2199,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, /* make a clause we can pass down to subqueries to select all rels */ allrels = makeNode(LockingClause); allrels->lockedRels = NIL; /* indicates all rels */ - allrels->forUpdate = lc->forUpdate; + allrels->strength = lc->strength; allrels->noWait = lc->noWait; if (lockedRels == NIL) @@ -2218,15 +2218,15 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, if (rte->relkind == RELKIND_FOREIGN_TABLE) break; applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, pushedDown); + lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, pushedDown); + lc->strength, lc->noWait, pushedDown); /* - * FOR UPDATE/SHARE of subquery is propagated to all of + * FOR [KEY] UPDATE/SHARE of subquery is propagated to all of * subquery's rels, too. We could do this later (based on * the marking of the subquery RTE) but it is convenient * to have local knowledge in each query level about which @@ -2252,7 +2252,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, if (thisrel->catalogname || thisrel->schemaname) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("SELECT FOR UPDATE/SHARE must specify unqualified relation names"), + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE must specify unqualified relation names"), parser_errposition(pstate, thisrel->location))); i = 0; @@ -2269,17 +2269,17 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, if (rte->relkind == RELKIND_FOREIGN_TABLE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be used with foreign table \"%s\"", + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be used with foreign table \"%s\"", rte->eref->aliasname), parser_errposition(pstate, thisrel->location))); applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, + lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, + lc->strength, lc->noWait, pushedDown); /* see comment above */ transformLockingClause(pstate, rte->subquery, @@ -2288,25 +2288,25 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, case RTE_JOIN: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a join"), + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a join"), parser_errposition(pstate, thisrel->location))); break; case RTE_FUNCTION: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a function"), + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a function"), parser_errposition(pstate, thisrel->location))); break; case RTE_VALUES: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to VALUES"), + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to VALUES"), parser_errposition(pstate, thisrel->location))); break; case RTE_CTE: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT 
FOR UPDATE/SHARE cannot be applied to a WITH query"), + errmsg("SELECT FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE cannot be applied to a WITH query"), parser_errposition(pstate, thisrel->location))); break; default: @@ -2320,7 +2320,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, if (rt == NULL) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_TABLE), - errmsg("relation \"%s\" in FOR UPDATE/SHARE clause not found in FROM clause", + errmsg("relation \"%s\" in FOR UPDATE/SHARE/KEY UPDATE/KEY SHARE clause not found in FROM clause", thisrel->relname), parser_errposition(pstate, thisrel->location))); } @@ -2332,7 +2332,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, */ void applyLockingClause(Query *qry, Index rtindex, - bool forUpdate, bool noWait, bool pushedDown) + LockClauseStrength strength, bool noWait, bool pushedDown) { RowMarkClause *rc; @@ -2344,10 +2344,10 @@ applyLockingClause(Query *qry, Index rtindex, if ((rc = get_parse_rowmark(qry, rtindex)) != NULL) { /* - * If the same RTE is specified both FOR UPDATE and FOR SHARE, treat - * it as FOR UPDATE. (Reasonable, since you can't take both a shared - * and exclusive lock at the same time; it'll end up being exclusive - * anyway.) + * If the same RTE is specified for more than one locking strength, + * treat is as the strongest. (Reasonable, since you can't take both a + * shared and exclusive lock at the same time; it'll end up being + * exclusive anyway.) * * We also consider that NOWAIT wins if it's specified both ways. This * is a bit more debatable but raising an error doesn't seem helpful. @@ -2356,7 +2356,7 @@ applyLockingClause(Query *qry, Index rtindex, * * And of course pushedDown becomes false if any clause is explicit. */ - rc->forUpdate |= forUpdate; + rc->strength = Max(rc->strength, strength); rc->noWait |= noWait; rc->pushedDown &= pushedDown; return; @@ -2365,7 +2365,7 @@ applyLockingClause(Query *qry, Index rtindex, /* Make a new RowMarkClause */ rc = makeNode(RowMarkClause); rc->rti = rtindex; - rc->forUpdate = forUpdate; + rc->strength = strength; rc->noWait = noWait; rc->pushedDown = pushedDown; qry->rowMarks = lappend(qry->rowMarks, rc); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index b19afa88e7..828e11058e 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -361,6 +361,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType, %type OptTemp %type OnCommitOption +%type for_locking_strength %type for_locking_item %type for_locking_clause opt_for_locking_clause for_locking_items %type locked_rels_list @@ -8900,9 +8901,10 @@ select_with_parens: * The duplicative productions are annoying, but hard to get rid of without * creating shift/reduce conflicts. * - * FOR UPDATE/SHARE may be before or after LIMIT/OFFSET. + * The locking clause (FOR UPDATE etc) may be before or after LIMIT/OFFSET. * In <=7.2.X, LIMIT/OFFSET had to be after FOR UPDATE - * We now support both orderings, but prefer LIMIT/OFFSET before FOR UPDATE/SHARE + * We now support both orderings, but prefer LIMIT/OFFSET before the locking + * clause. 
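The Max() above encodes the "strongest clause wins" rule, which only works because the strength enum is declared from weakest to strongest (as the four grammar productions suggest). A toy illustration of combining FOR SHARE with FOR UPDATE on the same range table entry, using a stand-in enum:

    #include <stdio.h>

    /* Stand-in for LockClauseStrength, assumed ordered weakest to strongest. */
    typedef enum
    {
        TOY_LCS_FORKEYSHARE,
        TOY_LCS_FORSHARE,
        TOY_LCS_FORNOKEYUPDATE,
        TOY_LCS_FORUPDATE
    } ToyLockClauseStrength;

    #define TOY_MAX(a, b) ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        /* The same RTE marked FOR SHARE and FOR UPDATE ends up FOR UPDATE. */
        ToyLockClauseStrength s = TOY_LCS_FORSHARE;

        s = TOY_MAX(s, TOY_LCS_FORUPDATE);
        printf("%d\n", s == TOY_LCS_FORUPDATE);   /* prints 1 */
        return 0;
    }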
* 2002-08-28 bjm */ select_no_parens: @@ -9321,24 +9323,23 @@ for_locking_items: ; for_locking_item: - FOR UPDATE locked_rels_list opt_nowait + for_locking_strength locked_rels_list opt_nowait { LockingClause *n = makeNode(LockingClause); - n->lockedRels = $3; - n->forUpdate = TRUE; - n->noWait = $4; - $$ = (Node *) n; - } - | FOR SHARE locked_rels_list opt_nowait - { - LockingClause *n = makeNode(LockingClause); - n->lockedRels = $3; - n->forUpdate = FALSE; - n->noWait = $4; + n->lockedRels = $2; + n->strength = $1; + n->noWait = $3; $$ = (Node *) n; } ; +for_locking_strength: + FOR UPDATE { $$ = LCS_FORUPDATE; } + | FOR NO KEY UPDATE { $$ = LCS_FORNOKEYUPDATE; } + | FOR SHARE { $$ = LCS_FORSHARE; } + | FOR KEY SHARE { $$ = LCS_FORKEYSHARE; } + ; + locked_rels_list: OF qualified_name_list { $$ = $2; } | /* EMPTY */ { $$ = NIL; } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 564edf2e55..7ab0801887 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -69,6 +69,7 @@ #include "access/heapam.h" #include "access/htup_details.h" +#include "access/multixact.h" #include "access/reloptions.h" #include "access/transam.h" #include "access/xact.h" @@ -136,8 +137,9 @@ static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGUSR2 = false; static volatile sig_atomic_t got_SIGTERM = false; -/* Comparison point for determining whether freeze_max_age is exceeded */ +/* Comparison points for determining whether freeze_max_age is exceeded */ static TransactionId recentXid; +static MultiXactId recentMulti; /* Default freeze ages to use for autovacuum (varies by database) */ static int default_freeze_min_age; @@ -161,6 +163,7 @@ typedef struct avw_dbase Oid adw_datid; char *adw_name; TransactionId adw_frozenxid; + MultiXactId adw_frozenmulti; PgStat_StatDBEntry *adw_entry; } avw_dbase; @@ -1076,7 +1079,9 @@ do_start_worker(void) List *dblist; ListCell *cell; TransactionId xidForceLimit; + MultiXactId multiForceLimit; bool for_xid_wrap; + bool for_multi_wrap; avw_dbase *avdb; TimestampTz current_time; bool skipit = false; @@ -1122,12 +1127,20 @@ do_start_worker(void) if (xidForceLimit < FirstNormalTransactionId) xidForceLimit -= FirstNormalTransactionId; + /* Also determine the oldest datminmxid we will consider. */ + recentMulti = ReadNextMultiXactId(); + multiForceLimit = recentMulti - autovacuum_freeze_max_age; + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + /* * Choose a database to connect to. We pick the database that was least * recently auto-vacuumed, or one that needs vacuuming to prevent Xid - * wraparound-related data loss. If any db at risk of wraparound is + * wraparound-related data loss. If any db at risk of Xid wraparound is * found, we pick the one with oldest datfrozenxid, independently of - * autovacuum times. + * autovacuum times; similarly we pick the one with the oldest datminmxid + * if any is in MultiXactId wraparound. Note that those in Xid wraparound + * danger are given more priority than those in multi wraparound danger. * * Note that a database with no stats entry is not considered, except for * Xid wraparound purposes. 
The theory is that if no one has ever @@ -1143,6 +1156,7 @@ do_start_worker(void) */ avdb = NULL; for_xid_wrap = false; + for_multi_wrap = false; current_time = GetCurrentTimestamp(); foreach(cell, dblist) { @@ -1153,13 +1167,25 @@ do_start_worker(void) if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit)) { if (avdb == NULL || - TransactionIdPrecedes(tmp->adw_frozenxid, avdb->adw_frozenxid)) + TransactionIdPrecedes(tmp->adw_frozenxid, + avdb->adw_frozenxid)) avdb = tmp; for_xid_wrap = true; continue; } else if (for_xid_wrap) continue; /* ignore not-at-risk DBs */ + else if (MultiXactIdPrecedes(tmp->adw_frozenmulti, multiForceLimit)) + { + if (avdb == NULL || + MultiXactIdPrecedes(tmp->adw_frozenmulti, + avdb->adw_frozenmulti)) + avdb = tmp; + for_multi_wrap = true; + continue; + } + else if (for_multi_wrap) + continue; /* ignore not-at-risk DBs */ /* Find pgstat entry if any */ tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid); @@ -1642,6 +1668,7 @@ AutoVacWorkerMain(int argc, char *argv[]) /* And do an appropriate amount of work */ recentXid = ReadNewTransactionId(); + recentMulti = ReadNextMultiXactId(); do_autovacuum(); } @@ -1847,6 +1874,7 @@ get_database_list(void) avdb->adw_datid = HeapTupleGetOid(tup); avdb->adw_name = pstrdup(NameStr(pgdatabase->datname)); avdb->adw_frozenxid = pgdatabase->datfrozenxid; + avdb->adw_frozenmulti = pgdatabase->datminmxid; /* this gets set later: */ avdb->adw_entry = NULL; @@ -2601,6 +2629,7 @@ relation_needs_vacanalyze(Oid relid, /* freeze parameters */ int freeze_max_age; TransactionId xidForceLimit; + MultiXactId multiForceLimit; AssertArg(classForm != NULL); AssertArg(OidIsValid(relid)); @@ -2641,6 +2670,14 @@ relation_needs_vacanalyze(Oid relid, force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) && TransactionIdPrecedes(classForm->relfrozenxid, xidForceLimit)); + if (!force_vacuum) + { + multiForceLimit = recentMulti - autovacuum_freeze_max_age; + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + force_vacuum = MultiXactIdPrecedes(classForm->relminmxid, + multiForceLimit); + } *wraparound = force_vacuum; /* User disabled it in pg_class.reloptions? (But ignore if at risk) */ diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 33df6aecd6..b458de6971 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -55,7 +55,7 @@ static void rewriteValuesRTE(RangeTblEntry *rte, Relation target_relation, static void rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation); static void markQueryForLocking(Query *qry, Node *jtnode, - bool forUpdate, bool noWait, bool pushedDown); + LockClauseStrength strength, bool noWait, bool pushedDown); static List *matchLocks(CmdType event, RuleLock *rulelocks, int varno, Query *parsetree); static Query *fireRIRrules(Query *parsetree, List *activeRIRs, @@ -68,7 +68,7 @@ static Query *fireRIRrules(Query *parsetree, List *activeRIRs, * These locks will ensure that the relation schemas don't change under us * while we are rewriting and planning the query. * - * forUpdatePushedDown indicates that a pushed-down FOR UPDATE/SHARE applies + * forUpdatePushedDown indicates that a pushed-down FOR [KEY] UPDATE/SHARE applies * to the current subquery, requiring all rels to be opened with RowShareLock. * This should always be false at the start of the recursion. 
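A simplified sketch of the database-selection policy described above: databases past the XID force limit are considered first (oldest datfrozenxid wins), and only if none qualify are databases past the multixact force limit considered. The toy version below ignores the normal "least recently autovacuumed" path and uses plain linear comparisons instead of wraparound-aware ones:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct
    {
        const char *name;
        uint32_t    datfrozenxid;
        uint32_t    datminmxid;
    } ToyDb;

    static const ToyDb *
    toy_pick_db(const ToyDb *dbs, int n,
                uint32_t xid_force_limit, uint32_t multi_force_limit)
    {
        const ToyDb *pick = NULL;

        /* First preference: databases past the XID force limit, oldest first. */
        for (int i = 0; i < n; i++)
            if (dbs[i].datfrozenxid < xid_force_limit &&
                (pick == NULL || dbs[i].datfrozenxid < pick->datfrozenxid))
                pick = &dbs[i];
        if (pick != NULL)
            return pick;

        /* Otherwise: databases past the multixact force limit, oldest first. */
        for (int i = 0; i < n; i++)
            if (dbs[i].datminmxid < multi_force_limit &&
                (pick == NULL || dbs[i].datminmxid < pick->datminmxid))
                pick = &dbs[i];
        return pick;
    }

    int
    main(void)
    {
        ToyDb dbs[] = {
            {"a", 500, 900},   /* past neither limit            */
            {"b", 120, 950},   /* past the XID limit            */
            {"c", 400, 50},    /* past the multixact limit only */
        };

        printf("%s\n", toy_pick_db(dbs, 3, 200, 100)->name);   /* prints "b" */
        return 0;
    }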
* @@ -130,7 +130,7 @@ AcquireRewriteLocks(Query *parsetree, bool forUpdatePushedDown) * * If the relation is the query's result relation, then we * need RowExclusiveLock. Otherwise, check to see if the - * relation is accessed FOR UPDATE/SHARE or not. We can't + * relation is accessed FOR [KEY] UPDATE/SHARE or not. We can't * just grab AccessShareLock because then the executor would * be trying to upgrade the lock, leading to possible * deadlocks. @@ -1357,7 +1357,7 @@ ApplyRetrieveRule(Query *parsetree, } /* - * If FOR UPDATE/SHARE of view, be sure we get right initial lock on the + * If FOR [KEY] UPDATE/SHARE of view, be sure we get right initial lock on the * relations it references. */ rc = get_parse_rowmark(parsetree, rt_index); @@ -1405,8 +1405,8 @@ ApplyRetrieveRule(Query *parsetree, rte->modifiedCols = NULL; /* - * If FOR UPDATE/SHARE of view, mark all the contained tables as implicit - * FOR UPDATE/SHARE, the same as the parser would have done if the view's + * If FOR [KEY] UPDATE/SHARE of view, mark all the contained tables as implicit + * FOR [KEY] UPDATE/SHARE, the same as the parser would have done if the view's * subquery had been written out explicitly. * * Note: we don't consider forUpdatePushedDown here; such marks will be @@ -1414,13 +1414,13 @@ ApplyRetrieveRule(Query *parsetree, */ if (rc != NULL) markQueryForLocking(rule_action, (Node *) rule_action->jointree, - rc->forUpdate, rc->noWait, true); + rc->strength, rc->noWait, true); return parsetree; } /* - * Recursively mark all relations used by a view as FOR UPDATE/SHARE. + * Recursively mark all relations used by a view as FOR [KEY] UPDATE/SHARE. * * This may generate an invalid query, eg if some sub-query uses an * aggregate. We leave it to the planner to detect that. @@ -1432,7 +1432,7 @@ ApplyRetrieveRule(Query *parsetree, */ static void markQueryForLocking(Query *qry, Node *jtnode, - bool forUpdate, bool noWait, bool pushedDown) + LockClauseStrength strength, bool noWait, bool pushedDown) { if (jtnode == NULL) return; @@ -1446,16 +1446,16 @@ markQueryForLocking(Query *qry, Node *jtnode, /* ignore foreign tables */ if (rte->relkind != RELKIND_FOREIGN_TABLE) { - applyLockingClause(qry, rti, forUpdate, noWait, pushedDown); + applyLockingClause(qry, rti, strength, noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; } } else if (rte->rtekind == RTE_SUBQUERY) { - applyLockingClause(qry, rti, forUpdate, noWait, pushedDown); - /* FOR UPDATE/SHARE of subquery is propagated to subquery's rels */ + applyLockingClause(qry, rti, strength, noWait, pushedDown); + /* FOR [KEY] UPDATE/SHARE of subquery is propagated to subquery's rels */ markQueryForLocking(rte->subquery, (Node *) rte->subquery->jointree, - forUpdate, noWait, true); + strength, noWait, true); } /* other RTE types are unaffected by FOR UPDATE */ } @@ -1465,14 +1465,14 @@ markQueryForLocking(Query *qry, Node *jtnode, ListCell *l; foreach(l, f->fromlist) - markQueryForLocking(qry, lfirst(l), forUpdate, noWait, pushedDown); + markQueryForLocking(qry, lfirst(l), strength, noWait, pushedDown); } else if (IsA(jtnode, JoinExpr)) { JoinExpr *j = (JoinExpr *) jtnode; - markQueryForLocking(qry, j->larg, forUpdate, noWait, pushedDown); - markQueryForLocking(qry, j->rarg, forUpdate, noWait, pushedDown); + markQueryForLocking(qry, j->larg, strength, noWait, pushedDown); + markQueryForLocking(qry, j->rarg, strength, noWait, pushedDown); } else elog(ERROR, "unrecognized node type: %d", diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c 
index 84637fe581..2c1c652489 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -538,6 +538,20 @@ ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) return lockhash; } +/* + * Given two lock modes, return whether they would conflict. + */ +bool +DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) +{ + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + + if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2)) + return true; + + return false; +} + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. @@ -630,7 +644,6 @@ LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) return hasWaiters; } - /* * LockAcquire -- Check for lock conflicts, sleep if conflict found, * set lock if/when no conflicts. diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 90a9e2a915..51c350797d 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -3905,10 +3905,10 @@ CheckForSerializableConflictOut(bool visible, Relation relation, case HEAPTUPLE_RECENTLY_DEAD: if (!visible) return; - xid = HeapTupleHeaderGetXmax(tuple->t_data); + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); break; case HEAPTUPLE_DELETE_IN_PROGRESS: - xid = HeapTupleHeaderGetXmax(tuple->t_data); + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); break; case HEAPTUPLE_INSERT_IN_PROGRESS: xid = HeapTupleHeaderGetXmin(tuple->t_data); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 598e20f91c..8904c6f2da 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -131,7 +131,7 @@ CommandIsReadOnly(Node *parsetree) { case CMD_SELECT: if (stmt->rowMarks != NIL) - return false; /* SELECT FOR UPDATE/SHARE */ + return false; /* SELECT FOR [KEY] UPDATE/SHARE */ else if (stmt->hasModifyingCTE) return false; /* data-modifying CTE */ else @@ -2283,10 +2283,28 @@ CreateCommandTag(Node *parsetree) else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((PlanRowMark *) linitial(stmt->rowMarks))->markType == ROW_MARK_EXCLUSIVE) - tag = "SELECT FOR UPDATE"; - else - tag = "SELECT FOR SHARE"; + switch (((PlanRowMark *) linitial(stmt->rowMarks))->markType) + { + case ROW_MARK_EXCLUSIVE: + tag = "SELECT FOR UPDATE"; + break; + case ROW_MARK_NOKEYEXCLUSIVE: + tag = "SELECT FOR NO KEY UPDATE"; + break; + case ROW_MARK_SHARE: + tag = "SELECT FOR SHARE"; + break; + case ROW_MARK_KEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + case ROW_MARK_REFERENCE: + case ROW_MARK_COPY: + tag = "SELECT"; + break; + default: + tag = "???"; + break; + } } else tag = "SELECT"; @@ -2331,10 +2349,24 @@ CreateCommandTag(Node *parsetree) else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((RowMarkClause *) linitial(stmt->rowMarks))->forUpdate) - tag = "SELECT FOR UPDATE"; - else - tag = "SELECT FOR SHARE"; + switch (((RowMarkClause *) linitial(stmt->rowMarks))->strength) + { + case LCS_FORKEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + case LCS_FORSHARE: + tag = "SELECT FOR SHARE"; + break; + case LCS_FORNOKEYUPDATE: + tag = "SELECT FOR NO KEY UPDATE"; + break; + case LCS_FORUPDATE: + tag = "SELECT FOR UPDATE"; + break; + default: + tag = "???"; + break; + } } else tag = "SELECT"; diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 601d5ec861..243bdebbd2 100644 --- a/src/backend/utils/adt/ri_triggers.c 
+++ b/src/backend/utils/adt/ri_triggers.c @@ -299,7 +299,7 @@ RI_FKey_check(TriggerData *trigdata) * Get the relation descriptors of the FK and PK tables. * * pk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = trigdata->tg_relation; pk_rel = heap_open(riinfo->pk_relid, RowShareLock); @@ -400,7 +400,8 @@ RI_FKey_check(TriggerData *trigdata) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding FK attributes. * ---------- @@ -424,7 +425,7 @@ RI_FKey_check(TriggerData *trigdata) querysep = "AND"; queryoids[i] = fk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, @@ -535,7 +536,8 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * PK attributes themselves. * ---------- @@ -558,7 +560,7 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, @@ -655,7 +657,7 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action) * Get the relation descriptors of the FK and PK tables and the old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo->fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -724,7 +726,8 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY x WHERE $1 = fkatt1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes. * ---------- @@ -749,7 +752,7 @@ ri_restrict_del(TriggerData *trigdata, bool is_no_action) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, @@ -868,7 +871,7 @@ ri_restrict_upd(TriggerData *trigdata, bool is_no_action) * old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. 
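The reason FOR KEY SHARE suffices for these RI queries is the row-lock conflict behavior this patch establishes: FOR KEY SHARE conflicts only with FOR UPDATE, so an UPDATE that leaves the key columns alone (which now takes FOR NO KEY UPDATE) is no longer blocked by concurrent FK checks. A toy rendering of that conflict matrix, with stand-in enum names rather than backend code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins for the four row-lock modes, weakest to strongest. */
    enum toy_mode { KEY_SHARE, SHARE, NO_KEY_UPDATE, UPDATE_ };

    /* conflicts[a][b]: does a request for "a" conflict with a held lock "b"?
     * Symmetric; note that KEY_SHARE and NO_KEY_UPDATE do not conflict. */
    static const bool conflicts[4][4] = {
        /*                  KEY_SHARE SHARE  NO_KEY_UPDATE UPDATE_ */
        /* KEY_SHARE     */ {false,   false, false,        true},
        /* SHARE         */ {false,   false, true,         true},
        /* NO_KEY_UPDATE */ {false,   true,  true,         true},
        /* UPDATE_       */ {true,    true,  true,         true},
    };

    int
    main(void)
    {
        /* FK check (KEY SHARE) vs a non-key UPDATE of the PK row: no conflict. */
        printf("%d\n", conflicts[KEY_SHARE][NO_KEY_UPDATE]);   /* 0 */
        /* FK check vs a key-changing UPDATE or DELETE (FOR UPDATE): conflict. */
        printf("%d\n", conflicts[KEY_SHARE][UPDATE_]);         /* 1 */
        return 0;
    }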
*/ fk_rel = heap_open(riinfo->fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -972,7 +975,7 @@ ri_restrict_upd(TriggerData *trigdata, bool is_no_action) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index af10471581..16f56c6ade 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -4194,7 +4194,7 @@ get_select_query_def(Query *query, deparse_context *context, get_rule_expr(query->limitCount, context, false); } - /* Add FOR UPDATE/SHARE clauses if present */ + /* Add FOR [KEY] UPDATE/SHARE clauses if present */ if (query->hasForUpdate) { foreach(l, query->rowMarks) @@ -4205,12 +4205,26 @@ get_select_query_def(Query *query, deparse_context *context, if (rc->pushedDown) continue; - if (rc->forUpdate) - appendContextKeyword(context, " FOR UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - else - appendContextKeyword(context, " FOR SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + switch (rc->strength) + { + case LCS_FORKEYSHARE: + appendContextKeyword(context, " FOR KEY SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORSHARE: + appendContextKeyword(context, " FOR SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORNOKEYUPDATE: + appendContextKeyword(context, " FOR NO KEY UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORUPDATE: + appendContextKeyword(context, " FOR UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + } + appendStringInfo(buf, " OF %s", quote_identifier(get_rtable_name(rc->rti, context))); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 40238e959e..fa48b1ce1a 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -30,10 +30,11 @@ #include #include +#include "access/htup_details.h" +#include "access/multixact.h" #include "access/reloptions.h" #include "access/sysattr.h" #include "access/transam.h" -#include "access/htup_details.h" #include "access/xact.h" #include "catalog/catalog.h" #include "catalog/index.h" @@ -2725,7 +2726,8 @@ RelationBuildLocalRelation(const char *relname, * the XIDs that will be put into the new relation contents. */ void -RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) +RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid, + MultiXactId minmulti) { Oid newrelfilenode; RelFileNodeBackend newrnode; @@ -2738,6 +2740,7 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) relation->rd_rel->relkind == RELKIND_SEQUENCE) ? freezeXid == InvalidTransactionId : TransactionIdIsNormal(freezeXid)); + Assert(TransactionIdIsNormal(freezeXid) == MultiXactIdIsValid(minmulti)); /* Allocate a new relfilenode */ newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL, @@ -2793,6 +2796,7 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) classform->relallvisible = 0; } classform->relfrozenxid = freezeXid; + classform->relminmxid = minmulti; simple_heap_update(pg_class, &tuple->t_self, tuple); CatalogUpdateIndexes(pg_class, tuple); @@ -3764,6 +3768,9 @@ RelationGetIndexPredicate(Relation relation) * simple index keys, but attributes used in expressions and partial-index * predicates.) 
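A compact sketch of the "can this index back a foreign key?" test introduced below in RelationGetIndexAttrBitmap: only unique indexes with no expression columns and no partial-index predicate contribute their columns to the key-attribute bitmap. The Toy* structures stand in for IndexInfo and Bitmapset:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct
    {
        bool is_unique;
        bool has_expressions;
        bool has_predicate;          /* partial index? */
        int  natts;
        int  attnums[8];             /* simple column numbers; 0 marks an expression slot */
    } ToyIndex;

    /* Accumulate the columns of FK-referenceable indexes into a toy bitmap. */
    static uint32_t
    toy_key_columns(const ToyIndex *indexes, int nindexes)
    {
        uint32_t keycols = 0;

        for (int i = 0; i < nindexes; i++)
        {
            const ToyIndex *ind = &indexes[i];
            bool is_key = ind->is_unique && !ind->has_expressions && !ind->has_predicate;

            if (!is_key)
                continue;
            for (int j = 0; j < ind->natts; j++)
                if (ind->attnums[j] != 0)
                    keycols |= (uint32_t) 1 << ind->attnums[j];
        }
        return keycols;
    }

    int
    main(void)
    {
        ToyIndex indexes[] = {
            {true,  false, false, 1, {1}},   /* unique index on column 1: counted */
            {false, false, false, 1, {3}},   /* non-unique index: ignored         */
            {true,  false, true,  1, {4}},   /* partial unique index: ignored     */
        };

        printf("0x%x\n", (unsigned) toy_key_columns(indexes, 3));   /* 0x2: only column 1 */
        return 0;
    }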
* + * If "keyAttrs" is true, only attributes that can be referenced by foreign + * keys are considered. + * * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that * we can include system attributes (e.g., OID) in the bitmap representation. * @@ -3775,16 +3782,17 @@ RelationGetIndexPredicate(Relation relation) * be bms_free'd when not needed anymore. */ Bitmapset * -RelationGetIndexAttrBitmap(Relation relation) +RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs) { Bitmapset *indexattrs; + Bitmapset *uindexattrs; List *indexoidlist; ListCell *l; MemoryContext oldcxt; /* Quick exit if we already computed the result. */ if (relation->rd_indexattr != NULL) - return bms_copy(relation->rd_indexattr); + return bms_copy(keyAttrs ? relation->rd_keyattr : relation->rd_indexattr); /* Fast path if definitely no indexes */ if (!RelationGetForm(relation)->relhasindex) @@ -3810,26 +3818,38 @@ RelationGetIndexAttrBitmap(Relation relation) * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + uindexattrs = NULL; foreach(l, indexoidlist) { Oid indexOid = lfirst_oid(l); Relation indexDesc; IndexInfo *indexInfo; int i; + bool isKey; indexDesc = index_open(indexOid, AccessShareLock); /* Extract index key information from the index's pg_index row */ indexInfo = BuildIndexInfo(indexDesc); + /* Can this index be referenced by a foreign key? */ + isKey = indexInfo->ii_Unique && + indexInfo->ii_Expressions == NIL && + indexInfo->ii_Predicate == NIL; + /* Collect simple attribute references */ for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { int attrnum = indexInfo->ii_KeyAttrNumbers[i]; if (attrnum != 0) + { indexattrs = bms_add_member(indexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + if (isKey) + uindexattrs = bms_add_member(uindexattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + } } /* Collect all attributes used in expressions, too */ @@ -3846,10 +3866,11 @@ RelationGetIndexAttrBitmap(Relation relation) /* Now save a copy of the bitmap in the relcache entry. */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_keyattr = bms_copy(uindexattrs); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ - return indexattrs; + return keyAttrs ? 
uindexattrs : indexattrs; } /* diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index 38f702892f..923355d3ce 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -118,9 +118,8 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - /* We do not store cmax when locking a tuple */ - Assert(!(tup->t_infomask & (HEAP_MOVED | HEAP_IS_LOCKED))); - Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup))); + Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tup))); if (tup->t_infomask & HEAP_COMBOCID) return GetRealCmax(cid); diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 51f0afded9..f2c9ff2e1c 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -214,12 +214,25 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -250,29 +263,41 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; return true; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return true; - if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -282,7 +307,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { SetHintBits(tuple, buffer, 
HEAP_XMAX_INVALID, InvalidTransactionId); @@ -290,7 +315,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); return false; } @@ -380,12 +405,25 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -419,21 +457,38 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false)) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + return false; return true; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false)) return true; /* deleted after scan started */ @@ -441,10 +496,10 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) return false; /* deleted before scan started */ } - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return true; - if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -454,7 +509,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -462,7 +517,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); 
return false; } @@ -627,12 +682,30 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HeapTupleMayBeUpdated; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ return HeapTupleMayBeUpdated; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return HeapTupleMayBeUpdated; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return HeapTupleMayBeUpdated; + else + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return HeapTupleSelfUpdated; /* updated after scan started */ + else + return HeapTupleInvisible; /* updated before scan started */ + } + } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -666,26 +739,62 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return HeapTupleMayBeUpdated; return HeapTupleUpdated; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; - if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple))) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + /* + * If it's only locked but neither EXCL_LOCK nor KEYSHR_LOCK + * is set, it cannot possibly be running. Otherwise need to + * check. 
+ */ + if ((tuple->t_infomask & (HEAP_XMAX_EXCL_LOCK | + HEAP_XMAX_KEYSHR_LOCK)) && + MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple))) + return HeapTupleBeingUpdated; + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return HeapTupleMayBeUpdated; + } + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple))) + return HeapTupleBeingUpdated; + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return HeapTupleMayBeUpdated; + } + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return HeapTupleSelfUpdated; /* updated after scan started */ + else + return HeapTupleInvisible; /* updated before scan started */ + } + + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple))) return HeapTupleBeingUpdated; - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + + if (TransactionIdDidCommit(xmax)) + return HeapTupleUpdated; + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return HeapTupleMayBeUpdated; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return HeapTupleMayBeUpdated; if (HeapTupleHeaderGetCmax(tuple) >= curcid) return HeapTupleSelfUpdated; /* updated after scan started */ @@ -693,10 +802,10 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, return HeapTupleInvisible; /* updated before scan started */ } - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return HeapTupleBeingUpdated; - if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -706,7 +815,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -714,7 +823,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); return HeapTupleUpdated; /* updated by other */ } @@ -793,12 +902,25 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -833,32 
+955,47 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + { + snapshot->xmax = xmax; + return true; + } + if (TransactionIdDidCommit(xmax)) + return false; return true; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) { - snapshot->xmax = HeapTupleHeaderGetXmax(tuple); + snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); return true; } - if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -868,7 +1005,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -876,7 +1013,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); return false; /* updated by other */ } @@ -957,12 +1094,27 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* updated after scan started */ + else + return false; /* updated before scan started */ + } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -999,19 +1151,41 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - 
Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + { + /* updating transaction committed, but when? */ + if (XidInMVCCSnapshot(xmax, snapshot)) + return true; /* treat as still in progress */ + return false; + } return true; } if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) { if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) return true; /* deleted after scan started */ @@ -1019,10 +1193,10 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, return false; /* deleted before scan started */ } - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return true; - if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -1032,13 +1206,13 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, /* xmax transaction committed */ SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); } /* * OK, the deleting transaction committed too ... but when? */ - if (XidInMVCCSnapshot(HeapTupleHeaderGetXmax(tuple), snapshot)) + if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) return true; /* treat as still in progress */ return false; @@ -1112,7 +1286,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HEAPTUPLE_INSERT_IN_PROGRESS; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return HEAPTUPLE_INSERT_IN_PROGRESS; /* inserted and then deleted by same xact */ return HEAPTUPLE_DELETE_IN_PROGRESS; @@ -1144,7 +1318,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, if (tuple->t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { /* * "Deleting" xact really only locked it, so the tuple is live in any @@ -1158,40 +1332,96 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, { if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple))) + /* + * If it's only locked but neither EXCL_LOCK nor KEYSHR_LOCK + * are set, it cannot possibly be running; otherwise have to + * check. 
+ */ + if ((tuple->t_infomask & (HEAP_XMAX_EXCL_LOCK | + HEAP_XMAX_KEYSHR_LOCK)) && + MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple))) return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } else { - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return HEAPTUPLE_LIVE; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); } - - /* - * We don't really care whether xmax did commit, abort or crash. - * We know that xmax did lock the tuple, but it did not and will - * never actually update it. - */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); } + + /* + * We don't really care whether xmax did commit, abort or crash. + * We know that xmax did lock the tuple, but it did not and will + * never actually update it. + */ + return HEAPTUPLE_LIVE; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); - return HEAPTUPLE_LIVE; + TransactionId xmax; + + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple))) + { + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return HEAPTUPLE_LIVE; + if (TransactionIdIsInProgress(xmax)) + return HEAPTUPLE_DELETE_IN_PROGRESS; + else if (TransactionIdDidCommit(xmax)) + /* there are still lockers around -- can't return DEAD here */ + return HEAPTUPLE_RECENTLY_DEAD; + /* updating transaction aborted */ + return HEAPTUPLE_LIVE; + } + + Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED)); + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return HEAPTUPLE_LIVE; + /* multi is not running -- updating xact cannot be */ + Assert(!TransactionIdIsInProgress(xmax)); + if (TransactionIdDidCommit(xmax)) + { + if (!TransactionIdPrecedes(xmax, OldestXmin)) + return HEAPTUPLE_RECENTLY_DEAD; + else + return HEAPTUPLE_DEAD; + } + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed. + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + /* + * Deleter committed, but perhaps it was recent enough that some open + * transactions could still see the tuple. + */ + + /* Otherwise, it's dead and removable */ + return HEAPTUPLE_DEAD; } if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) { - if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetXmax(tuple)); + HeapTupleHeaderGetRawXmax(tuple)); else { /* @@ -1213,7 +1443,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, * Deleter committed, but perhaps it was recent enough that some open * transactions could still see the tuple. 
*/ - if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin)) + if (!TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin)) return HEAPTUPLE_RECENTLY_DEAD; /* Otherwise, it's dead and removable */ @@ -1246,11 +1476,22 @@ HeapTupleIsSurelyDead(HeapTupleHeader tuple, TransactionId OldestXmin) /* * If the inserting transaction committed, but any deleting transaction - * aborted, the tuple is still alive. Likewise, if XMAX is a lock rather - * than a delete, the tuple is still alive. + * aborted, the tuple is still alive. */ - if (tuple->t_infomask & - (HEAP_XMAX_INVALID | HEAP_IS_LOCKED | HEAP_XMAX_IS_MULTI)) + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return false; + + /* + * If the XMAX is just a lock, the tuple is still alive. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + return false; + + /* + * If the Xmax is a MultiXact, it might be dead or alive, but we cannot + * know without checking pg_multixact. + */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) return false; /* If deleter isn't known to have committed, assume it's still running. */ @@ -1258,7 +1499,7 @@ HeapTupleIsSurelyDead(HeapTupleHeader tuple, TransactionId OldestXmin) return false; /* Deleter committed, so tuple is dead if the XID is old enough. */ - return TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin); + return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin); } /* @@ -1375,3 +1616,54 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } + +/* + * Is the tuple really only locked? That is, is it not updated? + * + * It's easy to check just infomask bits if the locker is not a multi; but + * otherwise we need to verify that the updating transaction has not aborted. + * + * This function is here because it follows the same time qualification rules + * laid out at the top of this file. + */ +bool +HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) +{ + TransactionId xmax; + + /* if there's no valid Xmax, then there's obviously no update either */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return true; + + if (tuple->t_infomask & HEAP_XMAX_LOCK_ONLY) + return true; + + /* invalid xmax means no update */ + if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) + return true; + + /* + * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this + * must necessarily have been updated + */ + if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) + return false; + + /* ... but if it's a multi, then perhaps the updating Xid aborted. */ + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) /* shouldn't happen .. 
*/ + return true; + + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return false; + if (TransactionIdDidCommit(xmax)) + return false; + + /* + * not current, not in progress, not committed -- must have aborted or + * crashed + */ + return true; +} diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 0fe68bb9e1..6a82ea18ed 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -232,6 +232,10 @@ main(int argc, char *argv[]) ControlFile.checkPointCopy.oldestXidDB); printf(_("Latest checkpoint's oldestActiveXID: %u\n"), ControlFile.checkPointCopy.oldestActiveXid); + printf(_("Latest checkpoint's oldestMultiXact: %u\n"), + ControlFile.checkPointCopy.oldestMulti); + printf(_("Latest checkpoint's oldestMulti's DB: %u\n"), + ControlFile.checkPointCopy.oldestMultiDB); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Min recovery ending location: %X/%X\n"), diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 8734f2c868..8e7fe7eb72 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -85,10 +85,12 @@ main(int argc, char *argv[]) TransactionId set_xid = 0; Oid set_oid = 0; MultiXactId set_mxid = 0; + MultiXactId set_oldestmxid = 0; MultiXactOffset set_mxoff = (MultiXactOffset) -1; uint32 minXlogTli = 0; XLogSegNo minXlogSegNo = 0; char *endptr; + char *endptr2; char *DataDir; int fd; @@ -170,7 +172,15 @@ main(int argc, char *argv[]) case 'm': set_mxid = strtoul(optarg, &endptr, 0); - if (endptr == optarg || *endptr != '\0') + if (endptr == optarg || *endptr != ',') + { + fprintf(stderr, _("%s: invalid argument for option -m\n"), progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + set_oldestmxid = strtoul(endptr + 1, &endptr2, 0); + if (endptr2 == endptr + 1 || *endptr2 != '\0') { fprintf(stderr, _("%s: invalid argument for option -m\n"), progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); @@ -181,6 +191,16 @@ main(int argc, char *argv[]) fprintf(stderr, _("%s: multitransaction ID (-m) must not be 0\n"), progname); exit(1); } + /* + * XXX It'd be nice to have more sanity checks here, e.g. so + * that oldest is not wrapped around w.r.t. nextMulti. 
+ */ + if (set_oldestmxid == 0) + { + fprintf(stderr, _("%s: oldest multitransaction ID (-m) must not be 0\n"), + progname); + exit(1); + } break; case 'O': @@ -307,8 +327,15 @@ main(int argc, char *argv[]) ControlFile.checkPointCopy.nextOid = set_oid; if (set_mxid != 0) + { ControlFile.checkPointCopy.nextMulti = set_mxid; + ControlFile.checkPointCopy.oldestMulti = set_oldestmxid; + if (ControlFile.checkPointCopy.oldestMulti < FirstMultiXactId) + ControlFile.checkPointCopy.oldestMulti += FirstMultiXactId; + ControlFile.checkPointCopy.oldestMultiDB = InvalidOid; + } + if (set_mxoff != -1) ControlFile.checkPointCopy.nextMultiOffset = set_mxoff; @@ -471,6 +498,8 @@ GuessControlValues(void) ControlFile.checkPointCopy.nextMultiOffset = 0; ControlFile.checkPointCopy.oldestXid = FirstNormalTransactionId; ControlFile.checkPointCopy.oldestXidDB = InvalidOid; + ControlFile.checkPointCopy.oldestMulti = FirstMultiXactId; + ControlFile.checkPointCopy.oldestMultiDB = InvalidOid; ControlFile.checkPointCopy.time = (pg_time_t) time(NULL); ControlFile.checkPointCopy.oldestActiveXid = InvalidTransactionId; @@ -562,6 +591,10 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.oldestXidDB); printf(_("Latest checkpoint's oldestActiveXID: %u\n"), ControlFile.checkPointCopy.oldestActiveXid); + printf(_("Latest checkpoint's oldestMultiXid: %u\n"), + ControlFile.checkPointCopy.oldestMulti); + printf(_("Latest checkpoint's oldestMulti's DB: %u\n"), + ControlFile.checkPointCopy.oldestMultiDB); printf(_("Maximum data alignment: %u\n"), ControlFile.maxAlign); /* we don't print floatFormat since can't say much useful about it */ @@ -994,7 +1027,7 @@ usage(void) printf(_(" -e XIDEPOCH set next transaction ID epoch\n")); printf(_(" -f force update to be done\n")); printf(_(" -l xlogfile force minimum WAL starting location for new transaction log\n")); - printf(_(" -m XID set next multitransaction ID\n")); + printf(_(" -m XID,OLDEST set next multitransaction ID and oldest value\n")); printf(_(" -n no update, just show extracted control values (for testing)\n")); printf(_(" -o OID set next OID\n")); printf(_(" -O OFFSET set next multitransaction offset\n")); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index c737b3ff28..af9e506d2b 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -30,12 +30,23 @@ typedef struct BulkInsertStateData *BulkInsertState; -typedef enum +/* + * Possible lock modes for a tuple. 
+ */ +typedef enum LockTupleMode { - LockTupleShared, + /* SELECT FOR KEY SHARE */ + LockTupleKeyShare, + /* SELECT FOR SHARE */ + LockTupleShare, + /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */ + LockTupleNoKeyExclusive, + /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */ LockTupleExclusive } LockTupleMode; +#define MaxLockTupleMode LockTupleExclusive + /* * When heap_update, heap_delete, or heap_lock_tuple fail because the target * tuple is already outdated, they fill in this struct to provide information @@ -129,14 +140,16 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd); + HeapUpdateFailureData *hufd, LockTupleMode *lockmode); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, bool nowait, + bool follow_update, Buffer *buffer, HeapUpdateFailureData *hufd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); -extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid); +extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, + TransactionId cutoff_multi); extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, - Buffer buf); + MultiXactId cutoff_multi, Buffer buf); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 9db6953720..270924a01a 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -54,6 +54,7 @@ #define XLOG_HEAP2_CLEANUP_INFO 0x30 #define XLOG_HEAP2_VISIBLE 0x40 #define XLOG_HEAP2_MULTI_INSERT 0x50 +#define XLOG_HEAP2_LOCK_UPDATED 0x60 /* * All what we need to find changed tuple @@ -75,6 +76,8 @@ typedef struct xl_heaptid typedef struct xl_heap_delete { xl_heaptid target; /* deleted tuple id */ + TransactionId xmax; /* xmax of the deleted tuple */ + uint8 infobits_set; /* infomask bits */ bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ } xl_heap_delete; @@ -141,7 +144,10 @@ typedef struct xl_multi_insert_tuple typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ + TransactionId old_xmax; /* xmax of the old tuple */ + TransactionId new_xmax; /* xmax of the new tuple */ ItemPointerData newtid; /* new inserted tuple id */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ bool new_all_visible_cleared; /* same for the page of newtid */ /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ @@ -197,16 +203,32 @@ typedef struct xl_heap_newpage #define SizeOfHeapNewpage (offsetof(xl_heap_newpage, blkno) + sizeof(BlockNumber)) +/* flags for infobits_set */ +#define XLHL_XMAX_IS_MULTI 0x01 +#define XLHL_XMAX_LOCK_ONLY 0x02 +#define XLHL_XMAX_EXCL_LOCK 0x04 +#define XLHL_XMAX_KEYSHR_LOCK 0x08 +#define XLHL_KEYS_UPDATED 0x10 + /* This is what we need to know about lock */ typedef struct xl_heap_lock { xl_heaptid target; /* locked tuple id */ TransactionId locking_xid; /* might be a MultiXactId not xid */ - bool xid_is_mxact; /* is it? */ - bool shared_lock; /* shared or exclusive row lock? 
*/ + int8 infobits_set; /* infomask and infomask2 bits to set */ } xl_heap_lock; -#define SizeOfHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool)) +#define SizeOfHeapLock (offsetof(xl_heap_lock, infobits_set) + sizeof(int8)) + +/* This is what we need to know about locking an updated version of a row */ +typedef struct xl_heap_lock_updated +{ + xl_heaptid target; + TransactionId xmax; + uint8 infobits_set; +} xl_heap_lock_updated; + +#define SizeOfHeapLockUpdated (offsetof(xl_heap_lock_updated, infobits_set) + sizeof(uint8)) /* This is what we need to know about in-place update */ typedef struct xl_heap_inplace @@ -223,10 +245,11 @@ typedef struct xl_heap_freeze RelFileNode node; BlockNumber block; TransactionId cutoff_xid; + MultiXactId cutoff_multi; /* TUPLE OFFSET NUMBERS FOLLOW AT THE END */ } xl_heap_freeze; -#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId)) +#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId)) /* This is what we need to know about setting a visibility map bit */ typedef struct xl_heap_visible @@ -254,7 +277,7 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, - TransactionId cutoff_xid, + TransactionId cutoff_xid, MultiXactId cutoff_multi, OffsetNumber *offsets, int offcnt); extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer, TransactionId cutoff_xid); diff --git a/src/include/access/htup.h b/src/include/access/htup.h index 9cd4b88ed4..79e3c50ef1 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -80,7 +80,9 @@ typedef HeapTupleData *HeapTuple; extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, - CommandId *cmax, - bool *iscombo); + CommandId *cmax, bool *iscombo); + +/* Prototype for HeapTupleHeader accessors in heapam.c */ +extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple); #endif /* HTUP_H */ diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index aeab45bb97..6a28d8ed74 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -162,12 +162,16 @@ struct HeapTupleHeaderData #define HEAP_HASVARWIDTH 0x0002 /* has variable-width attribute(s) */ #define HEAP_HASEXTERNAL 0x0004 /* has external stored attribute(s) */ #define HEAP_HASOID 0x0008 /* has an object-id field */ -/* bit 0x0010 is available */ +#define HEAP_XMAX_KEYSHR_LOCK 0x0010 /* xmax is a key-shared locker */ #define HEAP_COMBOCID 0x0020 /* t_cid is a combo cid */ #define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */ -#define HEAP_XMAX_SHARED_LOCK 0x0080 /* xmax is shared locker */ -/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */ -#define HEAP_IS_LOCKED (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK) +#define HEAP_XMAX_LOCK_ONLY 0x0080 /* xmax, if valid, is only a locker */ + + /* xmax is a shared locker */ +#define HEAP_XMAX_SHR_LOCK (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK) + +#define HEAP_LOCK_MASK (HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | \ + HEAP_XMAX_KEYSHR_LOCK) #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ #define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */ #define HEAP_XMAX_COMMITTED 0x0400 /* t_xmax committed */ @@ -182,17 +186,42 
@@ struct HeapTupleHeaderData * upgrade support */ #define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) -#define HEAP_XACT_MASK 0xFFE0 /* visibility-related bits */ +#define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ + +/* + * A tuple is only locked (i.e. not updated by its Xmax) if it the + * HEAP_XMAX_LOCK_ONLY bit is set. + * + * See also HeapTupleHeaderIsOnlyLocked, which also checks for a possible + * aborted updater transaction. + */ +#define HEAP_XMAX_IS_LOCKED_ONLY(infomask) \ + ((infomask) & HEAP_XMAX_LOCK_ONLY) +/* + * Use these to test whether a particular lock is applied to a tuple + */ +#define HEAP_XMAX_IS_SHR_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK) +#define HEAP_XMAX_IS_EXCL_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK) +#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) \ + (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK) + +/* turn these all off when Xmax is to change */ +#define HEAP_XMAX_BITS (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | \ + HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_LOCK_ONLY) /* * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x3800 are available */ +/* bits 0x1800 are available */ +#define HEAP_KEYS_UPDATED 0x2000 /* tuple was updated and key cols + * modified, or tuple deleted */ #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xE000 /* visibility-related bits */ /* * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is @@ -219,7 +248,24 @@ struct HeapTupleHeaderData (tup)->t_choice.t_heap.t_xmin = (xid) \ ) -#define HeapTupleHeaderGetXmax(tup) \ +/* + * HeapTupleHeaderGetRawXmax gets you the raw Xmax field. To find out the Xid + * that updated a tuple, you might need to resolve the MultiXactId if certain + * bits are set. HeapTupleHeaderGetUpdateXid checks those bits and takes care + * to resolve the MultiXactId if necessary. This might involve multixact I/O, + * so it should only be used if absolutely necessary. + */ +#define HeapTupleHeaderGetUpdateXid(tup) \ +( \ + (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \ + ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \ + !((tup)->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? \ + HeapTupleGetUpdateXid(tup) \ + : \ + HeapTupleHeaderGetRawXmax(tup) \ +) + +#define HeapTupleHeaderGetRawXmax(tup) \ ( \ (tup)->t_choice.t_heap.t_xmax \ ) diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index b5486bec09..b08bb1f49a 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -13,8 +13,15 @@ #include "access/xlog.h" + +/* + * The first two MultiXactId values are reserved to store the truncation Xid + * and epoch of the first segment, so we start assigning multixact values from + * 2. + */ #define InvalidMultiXactId ((MultiXactId) 0) #define FirstMultiXactId ((MultiXactId) 1) +#define MaxMultiXactId ((MultiXactId) 0xFFFFFFFF) #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) @@ -22,6 +29,33 @@ #define NUM_MXACTOFFSET_BUFFERS 8 #define NUM_MXACTMEMBER_BUFFERS 16 +/* + * Possible multixact lock modes ("status"). The first four modes are for + * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the + * next two are used for update and delete modes. 
+ */ +typedef enum +{ + MultiXactStatusForKeyShare = 0x00, + MultiXactStatusForShare = 0x01, + MultiXactStatusForNoKeyUpdate = 0x02, + MultiXactStatusForUpdate = 0x03, + /* an update that doesn't touch "key" columns */ + MultiXactStatusNoKeyUpdate = 0x04, + /* other updates, and delete */ + MultiXactStatusUpdate = 0x05 +} MultiXactStatus; + +#define MaxMultiXactStatus MultiXactStatusUpdate + + +typedef struct MultiXactMember +{ + TransactionId xid; + MultiXactStatus status; +} MultiXactMember; + + /* ---------------- * multixact-related XLOG entries * ---------------- @@ -35,21 +69,24 @@ typedef struct xl_multixact_create { MultiXactId mid; /* new MultiXact's ID */ MultiXactOffset moff; /* its starting offset in members file */ - int32 nxids; /* number of member XIDs */ - TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ + int32 nmembers; /* number of member XIDs */ + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; } xl_multixact_create; -#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids) +#define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members)) -extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2); -extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid); +extern MultiXactId MultiXactIdCreate(TransactionId xid1, + MultiXactStatus status1, TransactionId xid2, + MultiXactStatus status2); +extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, + MultiXactStatus status); +extern MultiXactId ReadNextMultiXactId(void); extern bool MultiXactIdIsRunning(MultiXactId multi); -extern bool MultiXactIdIsCurrent(MultiXactId multi); -extern void MultiXactIdWait(MultiXactId multi); -extern bool ConditionalMultiXactIdWait(MultiXactId multi); extern void MultiXactIdSetOldestMember(void); -extern int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids); +extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **xids, + bool allow_old); +extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); @@ -60,14 +97,21 @@ extern void MultiXactShmemInit(void); extern void BootStrapMultiXact(void); extern void StartupMultiXact(void); extern void ShutdownMultiXact(void); +extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, + Oid oldest_datoid); extern void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, - MultiXactOffset *nextMultiOffset); + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB); extern void CheckPointMultiXact(void); +extern MultiXactId GetOldestMultiXactId(void); +extern void TruncateMultiXact(MultiXactId cutoff_multi); extern void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset); extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); +extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); extern void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len); @@ -78,5 +122,7 @@ extern void multixact_twophase_postabort(TransactionId xid, uint16 info, extern void multixact_redo(XLogRecPtr lsn, XLogRecord *record); extern void multixact_desc(StringInfo buf, uint8 xl_info, char *rec); +extern char *mxid_to_string(MultiXactId multi, int nmembers, + MultiXactMember *members); #endif /* MULTIXACT_H */ diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h index 13b991a8b1..f82d1f5734 100644 --- 
a/src/include/access/rewriteheap.h +++ b/src/include/access/rewriteheap.h @@ -21,7 +21,7 @@ typedef struct RewriteStateData *RewriteState; extern RewriteState begin_heap_rewrite(Relation NewHeap, TransactionId OldestXmin, TransactionId FreezeXid, - bool use_wal); + MultiXactId MultiXactFrzLimit, bool use_wal); extern void end_heap_rewrite(RewriteState state); extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple, HeapTuple newTuple); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a676793566..4b8fa0175b 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201301211 +#define CATALOG_VERSION_NO 201301231 #endif diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index fcc293899a..820552f013 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -67,6 +67,8 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO bool relhastriggers; /* has (or has had) any TRIGGERs */ bool relhassubclass; /* has (or has had) derived classes */ TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */ + TransactionId relminmxid; /* all multixacts in this rel are >= this. + * this is really a MultiXactId */ #ifdef CATALOG_VARLEN /* variable-length fields start here */ /* NOTE: These fields are not present in a relcache entry's rd_rel field. */ @@ -77,7 +79,7 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO /* Size of fixed part of pg_class tuples, not counting var-length fields */ #define CLASS_TUPLE_SIZE \ - (offsetof(FormData_pg_class,relfrozenxid) + sizeof(TransactionId)) + (offsetof(FormData_pg_class,relminmxid) + sizeof(TransactionId)) /* ---------------- * Form_pg_class corresponds to a pointer to a tuple with @@ -91,7 +93,7 @@ typedef FormData_pg_class *Form_pg_class; * ---------------- */ -#define Natts_pg_class 27 +#define Natts_pg_class 28 #define Anum_pg_class_relname 1 #define Anum_pg_class_relnamespace 2 #define Anum_pg_class_reltype 3 @@ -117,8 +119,9 @@ typedef FormData_pg_class *Form_pg_class; #define Anum_pg_class_relhastriggers 23 #define Anum_pg_class_relhassubclass 24 #define Anum_pg_class_relfrozenxid 25 -#define Anum_pg_class_relacl 26 -#define Anum_pg_class_reloptions 27 +#define Anum_pg_class_relminmxid 26 +#define Anum_pg_class_relacl 27 +#define Anum_pg_class_reloptions 28 /* ---------------- * initial contents of pg_class @@ -129,14 +132,17 @@ typedef FormData_pg_class *Form_pg_class; * ---------------- */ -/* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */ -DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 _null_ _null_ )); +/* + * Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId; + * similarly, "1" in relminmxid stands for FirstMultiXactId + */ +DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 3 1 _null_ _null_ )); DESCR(""); -DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 _null_ _null_ )); +DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 1 _null_ _null_ )); DESCR(""); -DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ )); +DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 1 _null_ _null_ )); 
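
For reference, the relminmxid column added above is an ordinary pg_class attribute, so once a server with this patch is running it can be inspected from SQL alongside relfrozenxid. A minimal illustrative query (the relation name is arbitrary; the bootstrap value "1" in the DATA lines stands for FirstMultiXactId, as noted in the comment):

-- Illustrative only; any relation name works here.
SELECT relname, relfrozenxid, relminmxid
  FROM pg_class
 WHERE relname = 'pg_class';
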
DESCR(""); -DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 27 0 t f f f f 3 _null_ _null_ )); +DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 f f p r 28 0 t f f f f 3 1 _null_ _null_ )); DESCR(""); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index ead3a6e4ba..e4a9abe7bc 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -21,7 +21,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 932 +#define PG_CONTROL_VERSION 933 /* * Body of CheckPoint XLOG records. This is declared here because we keep @@ -41,6 +41,8 @@ typedef struct CheckPoint MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ Oid oldestXidDB; /* database with minimum datfrozenxid */ + MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */ + Oid oldestMultiDB; /* database with minimum datminmxid */ pg_time_t time; /* time stamp of checkpoint */ /* diff --git a/src/include/catalog/pg_database.h b/src/include/catalog/pg_database.h index 4010959b02..baeddcd12a 100644 --- a/src/include/catalog/pg_database.h +++ b/src/include/catalog/pg_database.h @@ -41,6 +41,7 @@ CATALOG(pg_database,1262) BKI_SHARED_RELATION BKI_ROWTYPE_OID(1248) BKI_SCHEMA_M int32 datconnlimit; /* max connections allowed (-1=no limit) */ Oid datlastsysoid; /* highest OID to consider a system OID */ TransactionId datfrozenxid; /* all Xids < this are frozen in this DB */ + TransactionId datminmxid; /* all multixacts in the DB are >= this */ Oid dattablespace; /* default table space for this DB */ #ifdef CATALOG_VARLEN /* variable-length fields start here */ @@ -59,7 +60,7 @@ typedef FormData_pg_database *Form_pg_database; * compiler constants for pg_database * ---------------- */ -#define Natts_pg_database 12 +#define Natts_pg_database 13 #define Anum_pg_database_datname 1 #define Anum_pg_database_datdba 2 #define Anum_pg_database_encoding 3 @@ -70,10 +71,11 @@ typedef FormData_pg_database *Form_pg_database; #define Anum_pg_database_datconnlimit 8 #define Anum_pg_database_datlastsysoid 9 #define Anum_pg_database_datfrozenxid 10 -#define Anum_pg_database_dattablespace 11 -#define Anum_pg_database_datacl 12 +#define Anum_pg_database_datminmxid 11 +#define Anum_pg_database_dattablespace 12 +#define Anum_pg_database_datacl 13 -DATA(insert OID = 1 ( template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1663 _null_)); +DATA(insert OID = 1 ( template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1 1663 _null_)); SHDESCR("default template for new databases"); #define TemplateDbOid 1 diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 010605d774..028e1684ff 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2909,6 +2909,8 @@ DATA(insert OID = 1371 ( pg_lock_status PGNSP PGUID 12 1 1000 0 0 f f f f t t DESCR("view system lock information"); DATA(insert OID = 1065 ( pg_prepared_xact PGNSP PGUID 12 1 1000 0 0 f f f f t t v 0 0 2249 "" "{28,25,1184,26,26}" "{o,o,o,o,o}" "{transaction,gid,prepared,ownerid,dbid}" _null_ pg_prepared_xact _null_ _null_ _null_ )); DESCR("view two-phase transactions"); +DATA(insert OID = 3819 ( pg_get_multixact_members PGNSP PGUID 12 1 1000 0 0 f f f f t t v 1 0 2249 "28" "{28,28,25}" "{i,o,o}" "{multixid,xid,mode}" _null_ pg_get_multixact_members _null_ _null_ _null_ )); +DESCR("view members of a multixactid"); DATA(insert OID = 3537 ( 
pg_describe_object PGNSP PGUID 12 1 0 0 0 f f f f t f s 3 0 25 "26 26 23" _null_ _null_ _null_ _null_ pg_describe_object _null_ _null_ _null_ )); DESCR("get identification of SQL object"); diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index 532c31c11b..73c701fe53 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -30,6 +30,7 @@ extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, bool check_constraints, - TransactionId frozenXid); + TransactionId frozenXid, + MultiXactId frozenMulti); #endif /* CLUSTER_H */ diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index f70442af4a..d8dd8b04ed 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -153,12 +153,14 @@ extern void vac_update_relstats(Relation relation, double num_tuples, BlockNumber num_all_visible_pages, bool hasindex, - TransactionId frozenxid); + TransactionId frozenxid, + MultiXactId minmulti); extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age, bool sharedRel, TransactionId *oldestXmin, TransactionId *freezeLimit, - TransactionId *freezeTableLimit); + TransactionId *freezeTableLimit, + MultiXactId *multiXactFrzLimit); extern void vac_update_datfrozenxid(void); extern void vacuum_delay_point(void); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 5046893866..b1213a0635 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -193,7 +193,7 @@ extern void ExecConstraints(ResultRelInfo *resultRelInfo, extern ExecRowMark *ExecFindRowMark(EState *estate, Index rti); extern ExecAuxRowMark *ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist); extern TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate, - Relation relation, Index rti, + Relation relation, Index rti, int lockmode, ItemPointer tid, TransactionId priorXmax); extern HeapTuple EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, ItemPointer tid, TransactionId priorXmax); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index b23989e19f..76e8cdb1ad 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -403,9 +403,9 @@ typedef struct EState /* * ExecRowMark - - * runtime representation of FOR UPDATE/SHARE clauses + * runtime representation of FOR [KEY] UPDATE/SHARE clauses * - * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we should have an + * When doing UPDATE, DELETE, or SELECT FOR [KEY] UPDATE/SHARE, we should have an * ExecRowMark for each non-target relation in the query (except inheritance * parent RTEs, which can be ignored at runtime). See PlanRowMark for details * about most of the fields. In addition to fields directly derived from @@ -426,7 +426,7 @@ typedef struct ExecRowMark /* * ExecAuxRowMark - - * additional runtime representation of FOR UPDATE/SHARE clauses + * additional runtime representation of FOR [KEY] UPDATE/SHARE clauses * * Each LockRows and ModifyTable node keeps a list of the rowmarks it needs to * deal with. In addition to a pointer to the related entry in es_rowMarks, @@ -1824,7 +1824,7 @@ typedef struct SetOpState /* ---------------- * LockRowsState information * - * LockRows nodes are used to enforce FOR UPDATE/FOR SHARE locking. + * LockRows nodes are used to enforce FOR [KEY] UPDATE/SHARE locking. 
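
To make the weaker lock levels concrete, here is an illustrative two-session sketch; it is not taken from the patch, and it assumes the foo(key, value) table used by the isolation specs below, with "key" being the table's unique key column:

-- Session 1: update a non-key column; under this patch the row gets only a
-- FOR NO KEY UPDATE tuple lock.
BEGIN;
UPDATE foo SET value = 2 WHERE key = 1;

-- Session 2: a FOR KEY SHARE lock (what the RI triggers now take) does not
-- conflict with session 1's lock, so this returns without blocking.
BEGIN;
SELECT * FROM foo FOR KEY SHARE;

-- Had session 1 instead run "UPDATE foo SET key = 2", the key column would
-- change, the update would take the strongest tuple lock, and session 2
-- would block until session 1 commits or rolls back.
COMMIT;  -- in each session, when done
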
* ---------------- */ typedef struct LockRowsState diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 56cf592e0c..d8678e5b3f 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -74,7 +74,7 @@ typedef uint32 AclMode; /* a bitmask of privilege bits */ #define ACL_CONNECT (1<<11) /* for databases */ #define N_ACL_RIGHTS 12 /* 1 plus the last 1< +step s1r: ROLLBACK TO f; +step s2l: <... completed> +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; + +starting permutation: s1s s1u s2l s1r s1l s2c s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +step s1r: ROLLBACK TO f; +step s2l: <... completed> +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1c: COMMIT; + +starting permutation: s1s s1u s2l s1r s2c s1l s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +step s1r: ROLLBACK TO f; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1s s1u s2l s2c s1r s1l s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s1l s1c s2c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s1l s2c s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s2c s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s2c s1r s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +step s2c: COMMIT; +step s1u: <... completed> +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1s s2l s2c s1u s1r s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s1s s1u s1r s1l s1c s2c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s1r s1l s2c s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s1r s2c s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s2c s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2c: COMMIT; +step s1u: <... 
completed> +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s1s s2c s1u s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s2c: COMMIT; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s2c s1s s1u s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; diff --git a/src/test/isolation/expected/aborted-keyrevoke_2.out b/src/test/isolation/expected/aborted-keyrevoke_2.out new file mode 100644 index 0000000000..85f6ccb63e --- /dev/null +++ b/src/test/isolation/expected/aborted-keyrevoke_2.out @@ -0,0 +1,278 @@ +Parsed test spec with 2 sessions + +starting permutation: s1s s1u s1r s1l s1c s2l s2c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1s s1u s1r s1l s2l s1c s2c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; + +starting permutation: s1s s1u s1r s1l s2l s2c s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1c: COMMIT; + +starting permutation: s1s s1u s1r s2l s1l s1c s2c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; + +starting permutation: s1s s1u s1r s2l s1l s2c s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1c: COMMIT; + +starting permutation: s1s s1u s1r s2l s2c s1l s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1s s1u s2l s1r s1l s1c s2c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +step s1r: ROLLBACK TO f; +step s2l: <... completed> +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; + +starting permutation: s1s s1u s2l s1r s1l s2c s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +step s1r: ROLLBACK TO f; +step s2l: <... 
completed> +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1c: COMMIT; + +starting permutation: s1s s1u s2l s1r s2c s1l s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +step s1r: ROLLBACK TO f; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1s s1u s2l s2c s1r s1l s1c +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s1l s1c s2c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s1l s2c s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s1r s2c s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s1s s2l s1u s2c s1r s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1u: UPDATE foo SET key = 2; +step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1s s2l s2c s1u s1r s1l s1c +step s1s: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s1s s1u s1r s1l s1c s2c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s1r s1l s2c s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s1r s2c s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +invalid permutation detected + +starting permutation: s2l s1s s1u s2c s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s2c: COMMIT; +step s1u: <... 
completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s1s s2c s1u s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1s: SAVEPOINT f; +step s2c: COMMIT; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s2l s2c s1s s1u s1r s1l s1c +step s2l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s1s: SAVEPOINT f; +step s1u: UPDATE foo SET key = 2; +step s1r: ROLLBACK TO f; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1c: COMMIT; diff --git a/src/test/isolation/expected/delete-abort-savept-2.out b/src/test/isolation/expected/delete-abort-savept-2.out new file mode 100644 index 0000000000..f66a90c6f0 --- /dev/null +++ b/src/test/isolation/expected/delete-abort-savept-2.out @@ -0,0 +1,76 @@ +Parsed test spec with 2 sessions + +starting permutation: s1l s1svp s1d s1r s2l s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: SELECT * FROM foo FOR NO KEY UPDATE; +key value + +1 1 +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1c: COMMIT; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s2l s1r s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: SELECT * FROM foo FOR NO KEY UPDATE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s1r s2l2 s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: SELECT * FROM foo FOR NO KEY UPDATE; +key value + +1 1 +step s1r: ROLLBACK TO f; +step s2l2: SELECT * FROM foo FOR NO KEY UPDATE; +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s2l2 s1r s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: SELECT * FROM foo FOR NO KEY UPDATE; +key value + +1 1 +step s2l2: SELECT * FROM foo FOR NO KEY UPDATE; +step s1r: ROLLBACK TO f; +step s2l2: <... completed> +key value + +1 1 +step s1c: COMMIT; +step s2c: COMMIT; diff --git a/src/test/isolation/expected/delete-abort-savept.out b/src/test/isolation/expected/delete-abort-savept.out new file mode 100644 index 0000000000..3420cf47d7 --- /dev/null +++ b/src/test/isolation/expected/delete-abort-savept.out @@ -0,0 +1,243 @@ +Parsed test spec with 2 sessions + +starting permutation: s1l s1svp s1d s1r s1c s2l s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s1r s2l s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1c: COMMIT; +step s2l: <... 
completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s1r s2l s2c s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s2l: SELECT * FROM foo FOR UPDATE; +invalid permutation detected + +starting permutation: s1l s1svp s1d s2l s1r s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s1d s2l s1r s2c s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1r: ROLLBACK TO f; +invalid permutation detected + +starting permutation: s1l s1svp s1d s2l s2c s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s2l: SELECT * FROM foo FOR UPDATE; +invalid permutation detected + +starting permutation: s1l s1svp s2l s1d s1r s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; +step s2l: <... completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s1svp s2l s1d s1r s2c s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +invalid permutation detected + +starting permutation: s1l s1svp s2l s1d s2c s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR UPDATE; +step s1d: DELETE FROM foo; +invalid permutation detected + +starting permutation: s1l s1svp s2l s2c s1d s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s2l: SELECT * FROM foo FOR UPDATE; +invalid permutation detected + +starting permutation: s1l s2l s1svp s1d s1r s1c s2c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; +step s2l: <... 
completed> +key value + +1 1 +step s2c: COMMIT; + +starting permutation: s1l s2l s1svp s1d s1r s2c s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +invalid permutation detected + +starting permutation: s1l s2l s1svp s1d s2c s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +invalid permutation detected + +starting permutation: s1l s2l s1svp s2c s1d s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +step s1svp: SAVEPOINT f; +invalid permutation detected + +starting permutation: s1l s2l s2c s1svp s1d s1r s1c +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2l: SELECT * FROM foo FOR UPDATE; +invalid permutation detected + +starting permutation: s2l s1l s1svp s1d s1r s1c s2c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s2l s1l s1svp s1d s1r s2c s1c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s2l s1l s1svp s1d s2c s1r s1c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s2l s1l s1svp s2c s1d s1r s1c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +invalid permutation detected + +starting permutation: s2l s1l s2c s1svp s1d s1r s1c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s1l: SELECT * FROM foo FOR KEY SHARE; +step s2c: COMMIT; +step s1l: <... completed> +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; + +starting permutation: s2l s2c s1l s1svp s1d s1r s1c +step s2l: SELECT * FROM foo FOR UPDATE; +key value + +1 1 +step s2c: COMMIT; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s1svp: SAVEPOINT f; +step s1d: DELETE FROM foo; +step s1r: ROLLBACK TO f; +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-contention.out b/src/test/isolation/expected/fk-contention.out index 24ed72d427..0916f7f3d2 100644 --- a/src/test/isolation/expected/fk-contention.out +++ b/src/test/isolation/expected/fk-contention.out @@ -7,9 +7,8 @@ step upd: UPDATE foo SET b = 'Hello World'; starting permutation: ins upd com step ins: INSERT INTO bar VALUES (42); -step upd: UPDATE foo SET b = 'Hello World'; +step upd: UPDATE foo SET b = 'Hello World'; step com: COMMIT; -step upd: <... 
completed> starting permutation: upd ins com step upd: UPDATE foo SET b = 'Hello World'; diff --git a/src/test/isolation/expected/fk-deadlock.out b/src/test/isolation/expected/fk-deadlock.out index 36813f11f5..69eac88c2b 100644 --- a/src/test/isolation/expected/fk-deadlock.out +++ b/src/test/isolation/expected/fk-deadlock.out @@ -11,57 +11,151 @@ step s2c: COMMIT; starting permutation: s1i s1u s2i s1c s2u s2c step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; -step s2i: INSERT INTO child VALUES (2, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s1c: COMMIT; +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; + +starting permutation: s1i s1u s2i s2u s1c s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... completed> +step s2c: COMMIT; + +starting permutation: s1i s1u s2i s2u s2c s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +invalid permutation detected + +starting permutation: s1i s2i s1u s1c s2u s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s1u: UPDATE parent SET aux = 'bar'; step s1c: COMMIT; -step s2i: <... completed> step s2u: UPDATE parent SET aux = 'baz'; step s2c: COMMIT; starting permutation: s1i s2i s1u s2u s1c s2c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... completed> +step s2c: COMMIT; + +starting permutation: s1i s2i s1u s2u s2c s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +invalid permutation detected + +starting permutation: s1i s2i s2u s1u s1c s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; step s1u: UPDATE parent SET aux = 'bar'; +invalid permutation detected + +starting permutation: s1i s2i s2u s1u s2c s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected step s1c: COMMIT; + +starting permutation: s1i s2i s2u s2c s1u s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; step s2c: COMMIT; +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; -starting permutation: s1i s2i s2u s1u s2c s1c +starting permutation: s2i s1i s1u s1c s2u s2c +step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; + +starting permutation: s2i s1i s1u s2u s1c s2c step s2i: INSERT INTO child VALUES (2, 1); -step s2u: UPDATE parent SET aux = 'baz'; +step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; step s2u: <... 
completed> -error in steps s1u s2u: ERROR: deadlock detected step s2c: COMMIT; -step s1c: COMMIT; -starting permutation: s2i s1i s1u s2u s1c s2c +starting permutation: s2i s1i s1u s2u s2c s1c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +invalid permutation detected + +starting permutation: s2i s1i s2u s1u s1c s2c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s2u: UPDATE parent SET aux = 'baz'; step s1u: UPDATE parent SET aux = 'bar'; +invalid permutation detected + +starting permutation: s2i s1i s2u s1u s2c s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2i s1i s2u s1u s2c s1c +starting permutation: s2i s1i s2u s2c s1u s1c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s2u: UPDATE parent SET aux = 'baz'; +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... completed> -error in steps s1u s2u: ERROR: deadlock detected +step s1c: COMMIT; + +starting permutation: s2i s2u s1i s1u s1c s2c +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +invalid permutation detected + +starting permutation: s2i s2u s1i s1u s2c s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> step s1c: COMMIT; starting permutation: s2i s2u s1i s2c s1u s1c step s2i: INSERT INTO child VALUES (2, 1); step s2u: UPDATE parent SET aux = 'baz'; -step s1i: INSERT INTO child VALUES (1, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s2c: COMMIT; +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; + +starting permutation: s2i s2u s2c s1i s1u s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; step s2c: COMMIT; -step s1i: <... completed> +step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2.out b/src/test/isolation/expected/fk-deadlock2.out index 2d8e5e5b25..eda118550c 100644 --- a/src/test/isolation/expected/fk-deadlock2.out +++ b/src/test/isolation/expected/fk-deadlock2.out @@ -17,91 +17,138 @@ step s2u1: <... 
completed> step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +starting permutation: s1u1 s1u2 s2u1 s2u2 s1c s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected + +starting permutation: s1u1 s1u2 s2u1 s2u2 s2c s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected + +starting permutation: s1u1 s2u1 s1u2 s1c s2u2 s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected + starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected -step s1c: COMMIT; -step s2c: COMMIT; +invalid permutation detected starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s1c: COMMIT; -step s2c: COMMIT; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s1u2 s1c s2u2 s2c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected + starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... 
completed> -error in steps s2u2 s1u2: ERROR: deadlock detected -step s1c: COMMIT; -step s2c: COMMIT; +invalid permutation detected starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s1c: COMMIT; -step s2c: COMMIT; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s1u1 s1u2 s1c s2c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +invalid permutation detected + +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2c: COMMIT; -step s1u1: <... 
completed> +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2_1.out b/src/test/isolation/expected/fk-deadlock2_1.out index 30c4c99863..382734811c 100644 --- a/src/test/isolation/expected/fk-deadlock2_1.out +++ b/src/test/isolation/expected/fk-deadlock2_1.out @@ -19,92 +19,87 @@ step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; ERROR: current transaction is aborted, commands ignored until end of transaction block step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c +starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c +starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s1c: COMMIT; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; - -starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s2c: COMMIT; +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c +starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... 
completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c +starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2c: COMMIT; -step s1u1: <... completed> step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2_2.out b/src/test/isolation/expected/fk-deadlock2_2.out new file mode 100644 index 0000000000..b6be4b9892 --- /dev/null +++ b/src/test/isolation/expected/fk-deadlock2_2.out @@ -0,0 +1,105 @@ +Parsed test spec with 2 sessions + +starting permutation: s1u1 s1u2 s1c s2u1 s2u2 s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; + +starting permutation: s1u1 s1u2 s2u1 s1c s2u2 s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; +step s2u1: <... 
completed> +error in steps s1c s2u1: ERROR: could not serialize access due to concurrent update +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: current transaction is aborted, commands ignored until end of transaction block +step s2c: COMMIT; + +starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... 
completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock_1.out b/src/test/isolation/expected/fk-deadlock_1.out index ca75322cc1..d648e48c48 100644 --- a/src/test/isolation/expected/fk-deadlock_1.out +++ b/src/test/isolation/expected/fk-deadlock_1.out @@ -11,61 +11,57 @@ step s2c: COMMIT; starting permutation: s1i s1u s2i s1c s2u s2c step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; -step s2i: INSERT INTO child VALUES (2, 1); +step s2i: INSERT INTO child VALUES (2, 1); step s1c: COMMIT; -step s2i: <... completed> -error in steps s1c s2i: ERROR: could not serialize access due to concurrent update step s2u: UPDATE parent SET aux = 'baz'; -ERROR: current transaction is aborted, commands ignored until end of transaction block +ERROR: could not serialize access due to read/write dependencies among transactions step s2c: COMMIT; starting permutation: s1i s2i s1u s2u s1c s2c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update step s2c: COMMIT; starting permutation: s1i s2i s2u s1u s2c s1c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2i s1i s1u s2u s1c s2c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update step s2c: COMMIT; starting permutation: s2i s1i s2u s1u s2c s1c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... 
completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2i s2u s1i s2c s1u s1c step s2i: INSERT INTO child VALUES (2, 1); step s2u: UPDATE parent SET aux = 'baz'; -step s1i: INSERT INTO child VALUES (1, 1); +step s1i: INSERT INTO child VALUES (1, 1); step s2c: COMMIT; -step s1i: <... completed> -error in steps s2c s1i: ERROR: could not serialize access due to concurrent update step s1u: UPDATE parent SET aux = 'bar'; -ERROR: current transaction is aborted, commands ignored until end of transaction block +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock_2.out b/src/test/isolation/expected/fk-deadlock_2.out new file mode 100644 index 0000000000..503a7d2823 --- /dev/null +++ b/src/test/isolation/expected/fk-deadlock_2.out @@ -0,0 +1,67 @@ +Parsed test spec with 2 sessions + +starting permutation: s1i s1u s1c s2i s2u s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; + +starting permutation: s1i s1u s2i s1c s2u s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2i: INSERT INTO child VALUES (2, 1); +step s1c: COMMIT; +step s2u: UPDATE parent SET aux = 'baz'; +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1i s2i s1u s2u s1c s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1i s2i s2u s1u s2c s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2i s1i s1u s2u s1c s2c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s2i s1i s2u s1u s2c s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; +step s1u: <... 
completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2i s2u s1i s2c s1u s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1i: INSERT INTO child VALUES (1, 1); +step s2c: COMMIT; +step s1u: UPDATE parent SET aux = 'bar'; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-delete-insert.out b/src/test/isolation/expected/fk-delete-insert.out new file mode 100644 index 0000000000..1ab15aaf76 --- /dev/null +++ b/src/test/isolation/expected/fk-delete-insert.out @@ -0,0 +1,41 @@ +Parsed test spec with 2 sessions + +starting permutation: s1d s1c s2i s2c +step s1d: DELETE FROM A WHERE AID = 1; +step s1c: COMMIT; +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +ERROR: insert or update on table "b" violates foreign key constraint "b_aid_fkey" +step s2c: COMMIT; + +starting permutation: s1d s2i s1c s2c +step s1d: DELETE FROM A WHERE AID = 1; +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +step s1c: COMMIT; +step s2i: <... completed> +error in steps s1c s2i: ERROR: insert or update on table "b" violates foreign key constraint "b_aid_fkey" +step s2c: COMMIT; + +starting permutation: s1d s2i s2c s1c +step s1d: DELETE FROM A WHERE AID = 1; +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +invalid permutation detected + +starting permutation: s2i s1d s1c s2c +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +step s1d: DELETE FROM A WHERE AID = 1; +invalid permutation detected + +starting permutation: s2i s1d s2c s1c +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +step s1d: DELETE FROM A WHERE AID = 1; +step s2c: COMMIT; +step s1d: <... completed> +error in steps s2c s1d: ERROR: update or delete on table "a" violates foreign key constraint "b_aid_fkey" on table "b" +step s1c: COMMIT; + +starting permutation: s2i s2c s1d s1c +step s2i: INSERT INTO B (BID,AID,Col2) VALUES (2,1,0); +step s2c: COMMIT; +step s1d: DELETE FROM A WHERE AID = 1; +ERROR: update or delete on table "a" violates foreign key constraint "b_aid_fkey" on table "b" +step s1c: COMMIT; diff --git a/src/test/isolation/expected/lock-update-delete.out b/src/test/isolation/expected/lock-update-delete.out new file mode 100644 index 0000000000..c4248657df --- /dev/null +++ b/src/test/isolation/expected/lock-update-delete.out @@ -0,0 +1,65 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s2b s1s s2u s2d s1l s2c s1c +step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2b: BEGIN; +step s1s: SELECT * FROM foo; +key value + +1 1 +step s2u: UPDATE foo SET value = 2 WHERE key = 1; +step s2d: DELETE FROM foo; +step s1l: SELECT * FROM foo FOR KEY SHARE; +step s2c: COMMIT; +step s1l: <... completed> +error in steps s2c s1l: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1b s2b s1s s2u s2d s1l s2r s1c +step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2b: BEGIN; +step s1s: SELECT * FROM foo; +key value + +1 1 +step s2u: UPDATE foo SET value = 2 WHERE key = 1; +step s2d: DELETE FROM foo; +step s1l: SELECT * FROM foo FOR KEY SHARE; +step s2r: ROLLBACK; +step s1l: <... 
completed> +key value + +1 1 +step s1c: COMMIT; + +starting permutation: s1b s2b s1s s2u s2u2 s1l s2c s1c +step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2b: BEGIN; +step s1s: SELECT * FROM foo; +key value + +1 1 +step s2u: UPDATE foo SET value = 2 WHERE key = 1; +step s2u2: UPDATE foo SET key = 2 WHERE key = 1; +step s1l: SELECT * FROM foo FOR KEY SHARE; +step s2c: COMMIT; +step s1l: <... completed> +error in steps s2c s1l: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1b s2b s1s s2u s2u2 s1l s2r s1c +step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2b: BEGIN; +step s1s: SELECT * FROM foo; +key value + +1 1 +step s2u: UPDATE foo SET value = 2 WHERE key = 1; +step s2u2: UPDATE foo SET key = 2 WHERE key = 1; +step s1l: SELECT * FROM foo FOR KEY SHARE; +step s2r: ROLLBACK; +step s1l: <... completed> +key value + +1 1 +step s1c: COMMIT; diff --git a/src/test/isolation/expected/lock-update-traversal.out b/src/test/isolation/expected/lock-update-traversal.out new file mode 100644 index 0000000000..c8e90661b2 --- /dev/null +++ b/src/test/isolation/expected/lock-update-traversal.out @@ -0,0 +1,18 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s2b s1s s2u s1l s2c s2d s1c +step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2b: BEGIN; +step s1s: SELECT * FROM foo; +key value + +1 1 +step s2u: UPDATE foo SET value = 2 WHERE key = 1; +step s1l: SELECT * FROM foo FOR KEY SHARE; +key value + +1 1 +step s2c: COMMIT; +step s2d: DELETE FROM foo WHERE key = 1; +step s1c: COMMIT; +step s2d: <... completed> diff --git a/src/test/isolation/expected/multixact-no-deadlock.out b/src/test/isolation/expected/multixact-no-deadlock.out new file mode 100644 index 0000000000..5ba2e7818e --- /dev/null +++ b/src/test/isolation/expected/multixact-no-deadlock.out @@ -0,0 +1,24 @@ +Parsed test spec with 3 sessions + +starting permutation: s1lock s2lock s1svpt s3lock s1lock2 s2c s1c s3c +step s1lock: SELECT * FROM justthis FOR SHARE; +value + +1 +step s2lock: SELECT * FROM justthis FOR SHARE; +value + +1 +step s1svpt: SAVEPOINT foo; +step s3lock: SELECT * FROM justthis FOR UPDATE; +step s1lock2: SELECT * FROM justthis FOR SHARE; +value + +1 +step s2c: COMMIT; +step s1c: COMMIT; +step s3lock: <... completed> +value + +1 +step s3c: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 1d0770cd37..c4d6719de6 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -14,4 +14,9 @@ test: fk-contention test: fk-deadlock test: fk-deadlock2 test: eval-plan-qual +test: lock-update-delete +test: lock-update-traversal +test: delete-abort-savept +test: delete-abort-savept-2 +test: aborted-keyrevoke test: drop-index-concurrently-1 diff --git a/src/test/isolation/isolationtester.c b/src/test/isolation/isolationtester.c index 4c4556654b..f1bb87d2f1 100644 --- a/src/test/isolation/isolationtester.c +++ b/src/test/isolation/isolationtester.c @@ -564,6 +564,7 @@ run_permutation(TestSpec * testspec, int nsteps, Step ** steps) * but it can only be unblocked by running steps from other * sessions. */ + fflush(stdout); fprintf(stderr, "invalid permutation detected\n"); /* Cancel the waiting statement from this session. 
*/ diff --git a/src/test/isolation/specs/aborted-keyrevoke.spec b/src/test/isolation/specs/aborted-keyrevoke.spec new file mode 100644 index 0000000000..c60aa0cebb --- /dev/null +++ b/src/test/isolation/specs/aborted-keyrevoke.spec @@ -0,0 +1,31 @@ +# When a tuple that has been updated is locked, the locking command +# should traverse the update chain; thus, a DELETE should not be able +# to proceed until the lock has been released. + +setup +{ + CREATE TABLE foo ( + key int PRIMARY KEY, + value int + ); + + INSERT INTO foo VALUES (1, 1); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +setup { BEGIN; } +step "s1s" { SAVEPOINT f; } +step "s1u" { UPDATE foo SET key = 2; } # obtain KEY REVOKE +step "s1r" { ROLLBACK TO f; } # lose KEY REVOKE +step "s1l" { SELECT * FROM foo FOR KEY SHARE; } +step "s1c" { COMMIT; } + +session "s2" +setup { BEGIN; } +step "s2l" { SELECT * FROM foo FOR KEY SHARE; } +step "s2c" { COMMIT; } diff --git a/src/test/isolation/specs/delete-abort-savept-2.spec b/src/test/isolation/specs/delete-abort-savept-2.spec new file mode 100644 index 0000000000..d35c67f670 --- /dev/null +++ b/src/test/isolation/specs/delete-abort-savept-2.spec @@ -0,0 +1,34 @@ +# A funkier version of delete-abort-savept +setup +{ + CREATE TABLE foo ( + key INT PRIMARY KEY, + value INT + ); + + INSERT INTO foo VALUES (1, 1); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +setup { BEGIN; } +step "s1l" { SELECT * FROM foo FOR KEY SHARE; } +step "s1svp" { SAVEPOINT f; } +step "s1d" { SELECT * FROM foo FOR NO KEY UPDATE; } +step "s1r" { ROLLBACK TO f; } +step "s1c" { COMMIT; } + +session "s2" +setup { BEGIN; } +step "s2l" { SELECT * FROM foo FOR UPDATE; } +step "s2l2" { SELECT * FROM foo FOR NO KEY UPDATE; } +step "s2c" { COMMIT; } + +permutation "s1l" "s1svp" "s1d" "s1r" "s2l" "s1c" "s2c" +permutation "s1l" "s1svp" "s1d" "s2l" "s1r" "s1c" "s2c" +permutation "s1l" "s1svp" "s1d" "s1r" "s2l2" "s1c" "s2c" +permutation "s1l" "s1svp" "s1d" "s2l2" "s1r" "s1c" "s2c" diff --git a/src/test/isolation/specs/delete-abort-savept.spec b/src/test/isolation/specs/delete-abort-savept.spec new file mode 100644 index 0000000000..e41df20e89 --- /dev/null +++ b/src/test/isolation/specs/delete-abort-savept.spec @@ -0,0 +1,29 @@ +# After rolling back a subtransaction that upgraded a lock, the previously +# held lock should still be held. +setup +{ + CREATE TABLE foo ( + key INT PRIMARY KEY, + value INT + ); + + INSERT INTO foo VALUES (1, 1); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +setup { BEGIN; } +step "s1l" { SELECT * FROM foo FOR KEY SHARE; } +step "s1svp" { SAVEPOINT f; } +step "s1d" { DELETE FROM foo; } +step "s1r" { ROLLBACK TO f; } +step "s1c" { COMMIT; } + +session "s2" +setup { BEGIN; } +step "s2l" { SELECT * FROM foo FOR UPDATE; } +step "s2c" { COMMIT; } diff --git a/src/test/isolation/specs/fk-deadlock.spec b/src/test/isolation/specs/fk-deadlock.spec index 9f46c6b665..44500d5b9b 100644 --- a/src/test/isolation/specs/fk-deadlock.spec +++ b/src/test/isolation/specs/fk-deadlock.spec @@ -29,26 +29,3 @@ setup { BEGIN; SET deadlock_timeout = '10s'; } step "s2i" { INSERT INTO child VALUES (2, 1); } step "s2u" { UPDATE parent SET aux = 'baz'; } step "s2c" { COMMIT; } - -## Most theoretical permutations require that a blocked session execute a -## command, making them impossible in practice. 
-permutation "s1i" "s1u" "s1c" "s2i" "s2u" "s2c" -permutation "s1i" "s1u" "s2i" "s1c" "s2u" "s2c" -#permutation "s1i" "s1u" "s2i" "s2u" "s1c" "s2c" -#permutation "s1i" "s1u" "s2i" "s2u" "s2c" "s1c" -#permutation "s1i" "s2i" "s1u" "s1c" "s2u" "s2c" -permutation "s1i" "s2i" "s1u" "s2u" "s1c" "s2c" -#permutation "s1i" "s2i" "s1u" "s2u" "s2c" "s1c" -#permutation "s1i" "s2i" "s2u" "s1u" "s1c" "s2c" -permutation "s1i" "s2i" "s2u" "s1u" "s2c" "s1c" -#permutation "s1i" "s2i" "s2u" "s2c" "s1u" "s1c" -#permutation "s2i" "s1i" "s1u" "s1c" "s2u" "s2c" -permutation "s2i" "s1i" "s1u" "s2u" "s1c" "s2c" -#permutation "s2i" "s1i" "s1u" "s2u" "s2c" "s1c" -#permutation "s2i" "s1i" "s2u" "s1u" "s1c" "s2c" -permutation "s2i" "s1i" "s2u" "s1u" "s2c" "s1c" -#permutation "s2i" "s1i" "s2u" "s2c" "s1u" "s1c" -#permutation "s2i" "s2u" "s1i" "s1u" "s1c" "s2c" -#permutation "s2i" "s2u" "s1i" "s1u" "s2c" "s1c" -permutation "s2i" "s2u" "s1i" "s2c" "s1u" "s1c" -#permutation "s2i" "s2u" "s2c" "s1i" "s1u" "s1c" diff --git a/src/test/isolation/specs/fk-deadlock2.spec b/src/test/isolation/specs/fk-deadlock2.spec index a8f1516c4e..f500b26585 100644 --- a/src/test/isolation/specs/fk-deadlock2.spec +++ b/src/test/isolation/specs/fk-deadlock2.spec @@ -34,26 +34,3 @@ setup { BEGIN; SET deadlock_timeout = '10s'; } step "s2u1" { UPDATE B SET Col2 = 1 WHERE BID = 2; } step "s2u2" { UPDATE B SET Col2 = 1 WHERE BID = 2; } step "s2c" { COMMIT; } - -## Many theoretical permutations require that a blocked session execute a -## command, making them impossible in practice. -permutation "s1u1" "s1u2" "s1c" "s2u1" "s2u2" "s2c" -permutation "s1u1" "s1u2" "s2u1" "s1c" "s2u2" "s2c" -#permutation "s1u1" "s1u2" "s2u1" "s2u2" "s1c" "s2c" -#permutation "s1u1" "s1u2" "s2u1" "s2u2" "s2c" "s1c" -#permutation "s1u1" "s2u1" "s1u2" "s1c" "s2u2" "s2c" -permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c" -permutation "s1u1" "s2u1" "s1u2" "s2u2" "s2c" "s1c" -permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c" -permutation "s1u1" "s2u1" "s2u2" "s1u2" "s2c" "s1c" -#permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c" -#permutation "s2u1" "s1u1" "s1u2" "s1c" "s2u2" "s2c" -permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c" -permutation "s2u1" "s1u1" "s1u2" "s2u2" "s2c" "s1c" -permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c" -permutation "s2u1" "s1u1" "s2u2" "s1u2" "s2c" "s1c" -#permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c" -#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s1c" "s2c" -#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c" -permutation "s2u1" "s2u2" "s1u1" "s2c" "s1u2" "s1c" -#permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" "s1c" diff --git a/src/test/isolation/specs/lock-update-delete.spec b/src/test/isolation/specs/lock-update-delete.spec new file mode 100644 index 0000000000..4b9a5a64ed --- /dev/null +++ b/src/test/isolation/specs/lock-update-delete.spec @@ -0,0 +1,38 @@ +# If we update a tuple, and then delete (or update that touches the key) it, +# and later somebody tries to come along and traverse that update chain, +# he should get an error when locking the latest version, if the delete +# committed; or succeed, when the deleting transaction rolls back. 
+ +setup +{ + CREATE TABLE foo ( + key int PRIMARY KEY, + value int + ); + + INSERT INTO foo VALUES (1, 1); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +step "s1b" { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step "s1s" { SELECT * FROM foo; } # obtain snapshot +step "s1l" { SELECT * FROM foo FOR KEY SHARE; } # obtain lock +step "s1c" { COMMIT; } + +session "s2" +step "s2b" { BEGIN; } +step "s2u" { UPDATE foo SET value = 2 WHERE key = 1; } +step "s2d" { DELETE FROM foo; } +step "s2u2" { UPDATE foo SET key = 2 WHERE key = 1; } +step "s2c" { COMMIT; } +step "s2r" { ROLLBACK; } + +permutation "s1b" "s2b" "s1s" "s2u" "s2d" "s1l" "s2c" "s1c" +permutation "s1b" "s2b" "s1s" "s2u" "s2d" "s1l" "s2r" "s1c" +permutation "s1b" "s2b" "s1s" "s2u" "s2u2" "s1l" "s2c" "s1c" +permutation "s1b" "s2b" "s1s" "s2u" "s2u2" "s1l" "s2r" "s1c" diff --git a/src/test/isolation/specs/lock-update-traversal.spec b/src/test/isolation/specs/lock-update-traversal.spec new file mode 100644 index 0000000000..6c6c805d50 --- /dev/null +++ b/src/test/isolation/specs/lock-update-traversal.spec @@ -0,0 +1,32 @@ +# When a tuple that has been updated is locked, the locking command +# should traverse the update chain; thus, a DELETE should not be able +# to proceed until the lock has been released. + +setup +{ + CREATE TABLE foo ( + key int PRIMARY KEY, + value int + ); + + INSERT INTO foo VALUES (1, 1); +} + +teardown +{ + DROP TABLE foo; +} + +session "s1" +step "s1b" { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step "s1s" { SELECT * FROM foo; } # obtain snapshot +step "s1l" { SELECT * FROM foo FOR KEY SHARE; } # obtain lock +step "s1c" { COMMIT; } + +session "s2" +step "s2b" { BEGIN; } +step "s2u" { UPDATE foo SET value = 2 WHERE key = 1; } +step "s2c" { COMMIT; } +step "s2d" { DELETE FROM foo WHERE key = 1; } + +permutation "s1b" "s2b" "s1s" "s2u" "s1l" "s2c" "s2d" "s1c" diff --git a/src/test/isolation/specs/multixact-no-deadlock.spec b/src/test/isolation/specs/multixact-no-deadlock.spec new file mode 100644 index 0000000000..205658b897 --- /dev/null +++ b/src/test/isolation/specs/multixact-no-deadlock.spec @@ -0,0 +1,35 @@ +# If we already hold a lock of a given strength, do not deadlock when +# some other transaction is waiting for a conflicting lock and we try +# to acquire the same lock we already held. +setup +{ + CREATE TABLE justthis ( + value int + ); + + INSERT INTO justthis VALUES (1); +} + +teardown +{ + DROP TABLE justthis; +} + +session "s1" +setup { BEGIN; } +step "s1lock" { SELECT * FROM justthis FOR SHARE; } +step "s1svpt" { SAVEPOINT foo; } +step "s1lock2" { SELECT * FROM justthis FOR SHARE; } +step "s1c" { COMMIT; } + +session "s2" +setup { BEGIN; } +step "s2lock" { SELECT * FROM justthis FOR SHARE; } # ensure it's a multi +step "s2c" { COMMIT; } + +session "s3" +setup { BEGIN; } +step "s3lock" { SELECT * FROM justthis FOR UPDATE; } +step "s3c" { COMMIT; } + +permutation "s1lock" "s2lock" "s1svpt" "s3lock" "s1lock2" "s2c" "s1c" "s3c" -- 2.40.0