From b9b8831ad60f6e4bd580fe6dbe9749359298a3c4 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 7 Feb 2010 20:48:13 +0000
Subject: [PATCH] Create a "relation mapping" infrastructure to support
 changing the relfilenodes of shared or nailed system catalogs.  This has two
 key benefits:

* The new CLUSTER-based VACUUM FULL can be applied safely to all catalogs.

* We no longer have to use an unsafe reindex-in-place approach for reindexing
  shared catalogs.

CLUSTER on nailed catalogs now works too, although I left it disabled on
shared catalogs because the resulting pg_index.indisclustered update would
only be visible in one database.

Since reindexing shared system catalogs is now fully transactional and
crash-safe, the former special cases in REINDEX behavior have been removed;
shared catalogs are treated the same as non-shared.

This commit does not do anything about the recently-discussed problem of
deadlocks between VACUUM FULL/CLUSTER on a system catalog and other
concurrent queries; will address that in a separate patch.  As a stopgap,
parallel_schedule has been tweaked to run vacuum.sql by itself, to avoid
such failures during the regression tests.
---
 contrib/oid2name/oid2name.c          |  22 +-
 doc/src/sgml/catalogs.sgml           |   6 +-
 doc/src/sgml/diskusage.sgml          |  34 +-
 doc/src/sgml/func.sgml               |  64 +-
 doc/src/sgml/pgbuffercache.sgml      |   6 +-
 doc/src/sgml/ref/cluster.sgml        |  12 +-
 doc/src/sgml/ref/reindex.sgml        |  44 +-
 doc/src/sgml/storage.sgml            |  17 +-
 src/backend/access/index/genam.c     |   5 +-
 src/backend/access/transam/rmgr.c    |   5 +-
 src/backend/access/transam/xact.c    |  41 +-
 src/backend/access/transam/xlog.c    |   4 +-
 src/backend/bootstrap/bootparse.y    |  27 +-
 src/backend/bootstrap/bootstrap.c    |   9 +-
 src/backend/catalog/catalog.c        |   8 +-
 src/backend/catalog/heap.c           |  51 +-
 src/backend/catalog/index.c          | 244 ++++---
 src/backend/catalog/storage.c        |  60 +-
 src/backend/catalog/toasting.c       |   9 +-
 src/backend/commands/cluster.c       | 434 +++++++++----
 src/backend/commands/indexcmds.c     |  26 +-
 src/backend/commands/tablecmds.c     |  82 ++-
 src/backend/commands/vacuum.c        |   9 +-
 src/backend/executor/execMain.c      |   3 +-
 src/backend/parser/parse_clause.c    |   4 +-
 src/backend/utils/adt/dbsize.c       | 123 +++-
 src/backend/utils/cache/Makefile     |   4 +-
 src/backend/utils/cache/catcache.c   |  62 +-
 src/backend/utils/cache/inval.c      | 133 +++-
 src/backend/utils/cache/relcache.c   | 140 +++-
 src/backend/utils/cache/relmapper.c  | 913 +++++++++++++++++++++++++++
 src/backend/utils/init/miscinit.c    |  58 +-
 src/bin/pg_dump/pg_dump.c            |   8 +-
 src/include/access/rmgr.h            |   3 +-
 src/include/catalog/catalog.h        |   5 +-
 src/include/catalog/catversion.h     |   4 +-
 src/include/catalog/heap.h           |  10 +-
 src/include/catalog/index.h          |   7 +-
 src/include/catalog/pg_class.h       |  11 +-
 src/include/catalog/pg_proc.h        |   6 +-
 src/include/catalog/storage.h        |   3 +-
 src/include/commands/cluster.h       |  10 +-
 src/include/miscadmin.h              |   6 +-
 src/include/storage/lwlock.h         |   3 +-
 src/include/storage/relfilenode.h    |   6 +-
 src/include/storage/sinval.h         |  53 +-
 src/include/utils/builtins.h         |   4 +-
 src/include/utils/catcache.h         |   3 +-
 src/include/utils/inval.h            |   8 +-
 src/include/utils/rel.h              |  12 +-
 src/include/utils/relcache.h         |   7 +-
 src/include/utils/relmapper.h        |  62 ++
 src/test/regress/expected/vacuum.out |   2 +-
 src/test/regress/parallel_schedule   |   7 +-
 54 files changed, 2315 insertions(+), 584 deletions(-)
 create mode 100644 src/backend/utils/cache/relmapper.c
 create mode 100644 src/include/utils/relmapper.h

diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c
index 9d896b84f5..52d6fafeaf 100644
--- a/contrib/oid2name/oid2name.c
+++ b/contrib/oid2name/oid2name.c
@@ -5,7 +5,7 @@
  * Originally by
  * B. Palmer, bpalmer@crimelabs.net 1-17-2001
  *
- * $PostgreSQL: pgsql/contrib/oid2name/oid2name.c,v 1.36 2009/06/11 14:48:51 momjian Exp $
+ * $PostgreSQL: pgsql/contrib/oid2name/oid2name.c,v 1.37 2010/02/07 20:48:08 tgl Exp $
  */
 #include "postgres_fe.h"
 
@@ -440,7 +440,7 @@ sql_exec_dumpalldbs(PGconn *conn, struct options * opts)
 	/* get the oid and database name from the system pg_database table */
 	snprintf(todo, sizeof(todo),
 			 "SELECT d.oid AS \"Oid\", datname AS \"Database Name\", "
-	  "spcname AS \"Tablespace\" FROM pg_database d JOIN pg_tablespace t ON "
+	  "spcname AS \"Tablespace\" FROM pg_catalog.pg_database d JOIN pg_catalog.pg_tablespace t ON "
 			 "(dattablespace = t.oid) ORDER BY 2");
 
 	sql_exec(conn, todo, opts->quiet);
@@ -456,10 +456,10 @@ sql_exec_dumpalltables(PGconn *conn, struct options * opts)
 	char	   *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
 
 	snprintf(todo, sizeof(todo),
-		  "SELECT relfilenode as \"Filenode\", relname as \"Table Name\" %s "
+		  "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s "
 			 "FROM pg_class c "
 		   "	LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace "
-	"	LEFT JOIN pg_catalog.pg_database d ON d.datname = current_database(),"
+	"	LEFT JOIN pg_catalog.pg_database d ON d.datname = pg_catalog.current_database(),"
 			 "	pg_catalog.pg_tablespace t "
 			 "WHERE relkind IN ('r'%s%s) AND "
 			 "	%s"
@@ -477,7 +477,7 @@ sql_exec_dumpalltables(PGconn *conn, struct options * opts)
 }
 
 /*
- * Show oid, relfilenode, name, schema and tablespace for each of the
+ * Show oid, filenode, name, schema and tablespace for each of the
  * given objects in the current database.
  */
 void
@@ -492,7 +492,7 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	bool		written = false;
 	char	   *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
 
-	/* get tables qualifiers, whether names, relfilenodes, or OIDs */
+	/* get table qualifiers, whether names, filenodes, or OIDs */
 	comma_oids = get_comma_elts(opts->oids);
 	comma_tables = get_comma_elts(opts->tables);
 	comma_filenodes = get_comma_elts(opts->filenodes);
@@ -511,7 +511,7 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	{
 		if (written)
 			ptr += sprintf(ptr, " OR ");
-		ptr += sprintf(ptr, "c.relfilenode IN (%s)", comma_filenodes);
+		ptr += sprintf(ptr, "pg_catalog.pg_relation_filenode(c.oid) IN (%s)", comma_filenodes);
 		written = true;
 	}
 	if (opts->tables->num > 0)
@@ -527,10 +527,10 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	/* now build the query */
 	todo = (char *) myalloc(650 + strlen(qualifiers));
 	snprintf(todo, 650 + strlen(qualifiers),
-		 "SELECT relfilenode as \"Filenode\", relname as \"Table Name\" %s\n"
-			 "FROM pg_class c \n"
+		 "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s\n"
+			 "FROM pg_catalog.pg_class c \n"
 		 "	LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace \n"
-			 "	LEFT JOIN pg_catalog.pg_database d ON d.datname = current_database(),\n"
+			 "	LEFT JOIN pg_catalog.pg_database d ON d.datname = pg_catalog.current_database(),\n"
 			 "	pg_catalog.pg_tablespace t \n"
 			 "WHERE relkind IN ('r', 'i', 'S', 't') AND \n"
 			 "		t.oid = CASE\n"
@@ -554,7 +554,7 @@ sql_exec_dumpalltbspc(PGconn *conn, struct options * opts)
 
 	snprintf(todo, sizeof(todo),
 			 "SELECT oid AS \"Oid\", spcname as \"Tablespace Name\"\n"
-			 "FROM pg_tablespace");
+			 "FROM pg_catalog.pg_tablespace");
 
 	sql_exec(conn, todo, opts->quiet);
 }
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index ef0a615bcc..3503bc852c 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.220 2010/02/03 17:25:05 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.221 2010/02/07 20:48:09 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
  -->
@@ -1473,7 +1473,9 @@
       <entry><structfield>relfilenode</structfield></entry>
       <entry><type>oid</type></entry>
       <entry></entry>
-      <entry>Name of the on-disk file of this relation; 0 if none</entry>
+      <entry>Name of the on-disk file of this relation; zero means this
+       is a <quote>mapped</> relation whose disk file name is determined
+       by low-level state</entry>
      </row>
 
      <row>
diff --git a/doc/src/sgml/diskusage.sgml b/doc/src/sgml/diskusage.sgml
index 67f50c5f09..aa64e4228e 100644
--- a/doc/src/sgml/diskusage.sgml
+++ b/doc/src/sgml/diskusage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/diskusage.sgml,v 1.19 2010/02/03 17:25:05 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/diskusage.sgml,v 1.20 2010/02/07 20:48:09 tgl Exp $ -->
 
 <chapter id="diskusage">
  <title>Monitoring Disk Usage</title>
@@ -29,30 +29,31 @@
   </para>
 
   <para>
-   You can monitor disk space three ways:  using
-   SQL functions listed in <xref linkend="functions-admin-dbsize">,
-   using <command>VACUUM</> information, and from the command line 
-   using the tools in <filename>contrib/oid2name</>.  The SQL functions
-   are the easiest to use and report information about tables, tables with
-   indexes and long value storage (TOAST), databases, and tablespaces.
+   You can monitor disk space in three ways:
+   using the SQL functions listed in <xref linkend="functions-admin-dbsize">,
+   using the tools in <filename>contrib/oid2name</>, or
+   using manual inspection of the system catalogs.
+   The SQL functions are the easiest to use and are generally recommended.
+   <filename>contrib/oid2name</> is described in <xref linkend="oid2name">.
+   The remainder of this section shows how to monitor disk usage by
+   inspecting the system catalogs.
   </para>
 
   <para>
    Using <application>psql</> on a recently vacuumed or analyzed database,
    you can issue queries to see the disk usage of any table:
 <programlisting>
-SELECT relfilenode, relpages FROM pg_class WHERE relname = 'customer';
+SELECT pg_relation_filepath(oid), relpages FROM pg_class WHERE relname = 'customer';
 
- relfilenode | relpages 
--------------+----------
-       16806 |       60
+ pg_relation_filepath | relpages 
+----------------------+----------
+ base/16384/16806     |       60
 (1 row)
 </programlisting>
    Each page is typically 8 kilobytes. (Remember, <structfield>relpages</>
    is only updated by <command>VACUUM</>, <command>ANALYZE</>, and
-   a few DDL commands such as <command>CREATE INDEX</>.)  The
-   <structfield>relfilenode</> value is of interest if you want to examine
-   the table's disk file directly.
+   a few DDL commands such as <command>CREATE INDEX</>.)  The file pathname
+   is of interest if you want to examine the table's disk file directly.
   </para>
 
   <para>
@@ -107,11 +108,6 @@ ORDER BY relpages DESC;
  customer             |     3144
 </programlisting>
   </para>
-
-  <para>
-   You can also use <filename>contrib/oid2name</> to show disk usage; see
-   <xref linkend="oid2name"> for more details and examples.
-  </para>
  </sect1>
 
  <sect1 id="disk-full">
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 121515d576..fed003c4d0 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.500 2010/02/01 15:38:21 rhaas Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.501 2010/02/07 20:48:09 tgl Exp $ -->
 
  <chapter id="functions">
   <title>Functions and Operators</title>
@@ -13434,6 +13434,68 @@ postgres=# SELECT * FROM pg_xlogfile_name_offset(pg_stop_backup());
     appropriate.
    </para>
 
+   <para>
+    The functions shown in <xref linkend="functions-admin-dblocation"> assist
+    in identifying the specific disk files associated with database objects.
+   </para>
+
+   <indexterm>
+    <primary>pg_relation_filenode</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_relation_filepath</primary>
+   </indexterm>
+
+   <table id="functions-admin-dblocation">
+    <title>Database Object Location Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry>
+        <literal><function>pg_relation_filenode</function>(<parameter>relation</parameter> <type>regclass</type>)</literal>
+        </entry>
+       <entry><type>oid</type></entry>
+       <entry>
+        Filenode number of the relation with the specified OID or name
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_relation_filepath</function>(<parameter>relation</parameter> <type>regclass</type>)</literal>
+        </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        File path name of the relation with the specified OID or name
+       </entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    <function>pg_relation_filenode</> accepts the OID or name of a table,
+    index, sequence, or toast table, and returns the <quote>filenode</> number
+    currently assigned to it.  The filenode is the base component of the file
+    name(s) used for the relation (see <xref linkend="storage-file-layout">
+    for more information).  For most tables the result is the same as
+    <structname>pg_class</>.<structfield>relfilenode</>, but for certain
+    system catalogs <structfield>relfilenode</> is zero and this function must
+    be used to get the correct value.  The function returns NULL if passed
+    a relation that does not have storage, such as a view.
+   </para>
+
+   <para>
+    <function>pg_relation_filepath</> is similar to
+    <function>pg_relation_filenode</>, but it returns the entire file pathname
+    (relative to the database cluster's data directory <varname>PGDATA</>) of
+    the relation.
+   </para>
+
    <para>
     The functions shown in <xref
     linkend="functions-admin-genfile"> provide native access to
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index a80a910d67..3ea74ec507 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/pgbuffercache.sgml,v 2.5 2009/05/18 11:08:24 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgbuffercache.sgml,v 2.6 2010/02/07 20:48:09 tgl Exp $ -->
 
 <sect1 id="pgbuffercache">
  <title>pg_buffercache</title>
@@ -56,7 +56,7 @@
       <entry><structfield>relfilenode</structfield></entry>
       <entry><type>oid</type></entry>
       <entry><literal>pg_class.relfilenode</literal></entry>
-      <entry>Relfilenode of the relation</entry>
+      <entry>Filenode number of the relation</entry>
      </row>
 
      <row>
@@ -137,7 +137,7 @@
   <programlisting>
   regression=# SELECT c.relname, count(*) AS buffers
                FROM pg_buffercache b INNER JOIN pg_class c
-               ON b.relfilenode = c.relfilenode AND
+               ON b.relfilenode = pg_relation_filenode(c.oid) AND
                   b.reldatabase IN (0, (SELECT oid FROM pg_database
                                         WHERE datname = current_database()))
                GROUP BY c.relname
diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml
index 9d186aeb3c..4c690d9eda 100644
--- a/doc/src/sgml/ref/cluster.sgml
+++ b/doc/src/sgml/ref/cluster.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/cluster.sgml,v 1.47 2009/09/19 10:23:26 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/cluster.sgml,v 1.48 2010/02/07 20:48:09 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -30,12 +30,12 @@ CLUSTER [VERBOSE]
   <title>Description</title>
 
   <para>
-   <command>CLUSTER</command> instructs <productname>PostgreSQL</productname> 
+   <command>CLUSTER</command> instructs <productname>PostgreSQL</productname>
    to cluster the table specified
    by <replaceable class="parameter">table_name</replaceable>
    based on the index specified by
    <replaceable class="parameter">index_name</replaceable>. The index must
-   already have been defined on 
+   already have been defined on
    <replaceable class="parameter">table_name</replaceable>.
   </para>
 
@@ -46,9 +46,9 @@ CLUSTER [VERBOSE]
    not clustered.  That is, no attempt is made to store new or
    updated rows according to their index order.  (If one wishes, one can
    periodically recluster by issuing the command again.  Also, setting
-   the table's <literal>FILLFACTOR</literal> storage parameter to less than 100% can aid
-   in preserving cluster ordering during updates, since updated rows
-   are preferentially kept on the same page.)
+   the table's <literal>FILLFACTOR</literal> storage parameter to less than
+   100% can aid in preserving cluster ordering during updates, since updated
+   rows are kept on the same page if enough space is available there.)
   </para>
 
   <para>
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index b4b1466f5c..dc75d6e6b2 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.38 2008/11/14 10:22:47 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.39 2010/02/07 20:48:09 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -77,7 +77,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
    </itemizedlist>
   </para>
  </refsect1>
-  
+
  <refsect1>
   <title>Parameters</title>
 
@@ -106,9 +106,9 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
     <listitem>
      <para>
       Recreate all indexes within the current database.
-      Indexes on shared system catalogs are skipped except in stand-alone mode
-      (see below). This form of <command>REINDEX</command> cannot be executed 
-      inside a transaction block.
+      Indexes on shared system catalogs are also processed.
+      This form of <command>REINDEX</command> cannot be executed inside a
+      transaction block.
      </para>
     </listitem>
    </varlistentry>
@@ -118,8 +118,8 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
     <listitem>
      <para>
       Recreate all indexes on system catalogs within the current database.
-      Indexes on user tables are not processed.  Also, indexes on shared
-      system catalogs are skipped except in stand-alone mode (see below).
+      Indexes on shared system catalogs are included.
+      Indexes on user tables are not processed.
       This form of <command>REINDEX</command> cannot be executed inside a
       transaction block.
      </para>
@@ -134,7 +134,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
       reindexed.  Index and table names can be schema-qualified.
       Presently, <command>REINDEX DATABASE</> and <command>REINDEX SYSTEM</>
       can only reindex the current database, so their parameter must match
-      the current database's name. 
+      the current database's name.
      </para>
     </listitem>
    </varlistentry>
@@ -156,7 +156,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
   <para>
    If you suspect corruption of an index on a user table, you can
    simply rebuild that index, or all indexes on the table, using
-   <command>REINDEX INDEX</command> or <command>REINDEX TABLE</command>.  
+   <command>REINDEX INDEX</command> or <command>REINDEX TABLE</command>.
   </para>
 
   <para>
@@ -197,30 +197,6 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
    have been completed.
   </para>
 
-  <para>
-   If corruption is suspected in the indexes of any of the shared
-   system catalogs (which are <structname>pg_authid</structname>,
-   <structname>pg_auth_members</structname>,
-   <structname>pg_database</structname>,
-   <structname>pg_pltemplate</structname>,
-   <structname>pg_shdepend</structname>,
-   <structname>pg_shdescription</structname>, and
-   <structname>pg_tablespace</structname>), then a standalone server
-   must be used to repair it.  <command>REINDEX</> will not process
-   shared catalogs in multiuser mode.
-  </para>
-
-  <para>
-   For all indexes except the shared system catalogs, <command>REINDEX</>
-   is crash-safe and transaction-safe.  <command>REINDEX</> is not
-   crash-safe for shared indexes, which is why this case is disallowed
-   during normal operation.  If a failure occurs while reindexing one
-   of these catalogs in standalone mode, it will not be possible to
-   restart the regular server until the problem is rectified.  (The
-   typical symptom of a partially rebuilt shared index is <quote>index is not
-   a btree</> errors.)
-  </para>
-
   <para>
    <command>REINDEX</command> is similar to a drop and recreate of the index
    in that the index contents are rebuilt from scratch.  However, the locking
@@ -290,7 +266,7 @@ broken_db=&gt; \q
 </programlisting>
   </para>
  </refsect1>
- 
+
  <refsect1>
   <title>Compatibility</title>
 
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index fcdbd0ee36..e0cef7dd7b 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.30 2009/07/22 01:21:22 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.31 2010/02/07 20:48:09 tgl Exp $ -->
 
 <chapter id="storage">
 
@@ -147,6 +147,11 @@ Note that while a table's filenode often matches its OID, this is
 <command>TRUNCATE</>, <command>REINDEX</>, <command>CLUSTER</> and some forms
 of <command>ALTER TABLE</>, can change the filenode while preserving the OID.
 Avoid assuming that filenode and table OID are the same.
+Also, for certain system catalogs including <structname>pg_class</> itself,
+<structname>pg_class</>.<structfield>relfilenode</> contains zero.  The
+actual filenode number of these catalogs is stored in a lower-level data
+structure, and can be obtained using the <function>pg_relation_filenode()</>
+function.
 </para>
 </caution>
 
@@ -188,6 +193,16 @@ tablespace is not accessed through <filename>pg_tblspc</>, but corresponds to
 <varname>PGDATA</><filename>/global</>.
 </para>
 
+<para>
+The <function>pg_relation_filepath()</> function shows the entire path
+(relative to <varname>PGDATA</>) of any relation.  It is often useful
+as a substitute for remembering many of the above rules.  But keep in
+mind that this function just gives the name of the first segment of the
+main fork of the relation &mdash; you may need to append a segment number
+and/or <literal>_fsm</> or <literal>_vm</> to find all the files associated
+with the relation.
+</para>
+
 <para>
 Temporary files (for operations such as sorting more data than can fit in
 memory) are created within <varname>PGDATA</><filename>/base/pgsql_tmp</>,
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index ddc3f6b4f9..bd28036087 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.79 2010/01/02 16:57:35 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.80 2010/02/07 20:48:09 tgl Exp $
  *
  * NOTES
  *	  many of the old access method routines have been turned into
@@ -21,6 +21,7 @@
 
 #include "access/relscan.h"
 #include "access/transam.h"
+#include "catalog/index.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
@@ -419,7 +420,7 @@ systable_beginscan_ordered(Relation heapRelation,
 
 	/* REINDEX can probably be a hard error here ... */
 	if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation)))
-		elog(ERROR, "cannot do ordered scan on index \"%s\", because it is the current REINDEX target",
+		elog(ERROR, "cannot do ordered scan on index \"%s\", because it is being reindexed",
 			 RelationGetRelationName(indexRelation));
 	/* ... but we only throw a warning about violating IgnoreSystemIndexes */
 	if (IgnoreSystemIndexes)
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 7e1e0f60fc..8038b25d1d 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.28 2009/12/19 01:32:33 sriggs Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.29 2010/02/07 20:48:09 tgl Exp $
  */
 #include "postgres.h"
 
@@ -22,6 +22,7 @@
 #include "commands/tablespace.h"
 #include "storage/freespace.h"
 #include "storage/standby.h"
+#include "utils/relmapper.h"
 
 
 const RmgrData RmgrTable[RM_MAX_ID + 1] = {
@@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 	{"Database", dbase_redo, dbase_desc, NULL, NULL, NULL},
 	{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
 	{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
-	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
+	{"RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL},
 	{"Standby", standby_redo, standby_desc, NULL, NULL, NULL},
 	{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
 	{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index f74a941f66..156ed5c47b 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.282 2010/01/24 21:49:17 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.283 2010/02/07 20:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -48,6 +48,7 @@
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
+#include "utils/relmapper.h"
 #include "utils/snapmgr.h"
 #include "pg_trace.h"
 
@@ -250,7 +251,7 @@ static void AbortTransaction(void);
 static void AtAbort_Memory(void);
 static void AtCleanup_Memory(void);
 static void AtAbort_ResourceOwner(void);
-static void AtCommit_LocalCache(void);
+static void AtCCI_LocalCache(void);
 static void AtCommit_Memory(void);
 static void AtStart_Cache(void);
 static void AtStart_Memory(void);
@@ -703,7 +704,7 @@ CommandCounterIncrement(void)
 		 * read-only command.  (But see hacks in inval.c to make real sure we
 		 * don't think a command that queued inval messages was read-only.)
 		 */
-		AtCommit_LocalCache();
+		AtCCI_LocalCache();
 	}
 
 	/*
@@ -1095,11 +1096,19 @@ cleanup:
 
 
 /*
- *	AtCommit_LocalCache
+ *	AtCCI_LocalCache
  */
 static void
-AtCommit_LocalCache(void)
+AtCCI_LocalCache(void)
 {
+	/*
+	 * Make any pending relation map changes visible.  We must do this
+	 * before processing local sinval messages, so that the map changes
+	 * will get reflected into the relcache when relcache invals are
+	 * processed.
+	 */
+	AtCCI_RelationMap();
+
 	/*
 	 * Make catalog changes visible to me for the next command.
 	 */
@@ -1734,6 +1743,9 @@ CommitTransaction(void)
 	/* Prevent cancel/die interrupt while cleaning up */
 	HOLD_INTERRUPTS();
 
+	/* Commit updates to the relation map --- do this as late as possible */
+	AtEOXact_RelationMap(true);
+
 	/*
 	 * set the current transaction state information appropriately during
 	 * commit processing
@@ -1980,6 +1992,7 @@ PrepareTransaction(void)
 	AtPrepare_Locks();
 	AtPrepare_PgStat();
 	AtPrepare_MultiXact();
+	AtPrepare_RelationMap();
 
 	/*
 	 * Here is where we really truly prepare.
@@ -2148,10 +2161,11 @@ AbortTransaction(void)
 	/*
 	 * do abort processing
 	 */
-	AfterTriggerEndXact(false);
+	AfterTriggerEndXact(false);			/* 'false' means it's abort */
 	AtAbort_Portals();
-	AtEOXact_LargeObject(false);	/* 'false' means it's abort */
+	AtEOXact_LargeObject(false);
 	AtAbort_Notify();
+	AtEOXact_RelationMap(false);
 
 	/*
 	 * Advertise the fact that we aborted in pg_clog (assuming that we got as
@@ -4625,11 +4639,18 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
 			SharedInvalidationMessage *msg = &msgs[i];
 
 			if (msg->id >= 0)
-				appendStringInfo(buf,  "catcache id%d ", msg->id);
+				appendStringInfo(buf, " catcache %d", msg->id);
+			else if (msg->id == SHAREDINVALCATALOG_ID)
+				appendStringInfo(buf, " catalog %u", msg->cat.catId);
 			else if (msg->id == SHAREDINVALRELCACHE_ID)
-				appendStringInfo(buf,  "relcache ");
+				appendStringInfo(buf, " relcache %u", msg->rc.relId);
+			/* remaining cases not expected, but print something anyway */
 			else if (msg->id == SHAREDINVALSMGR_ID)
-				appendStringInfo(buf,  "smgr ");
+				appendStringInfo(buf, " smgr");
+			else if (msg->id == SHAREDINVALRELMAP_ID)
+				appendStringInfo(buf, " relmap");
+			else
+				appendStringInfo(buf, " unknown id %d", msg->id);
 		}
 	}
 }
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 49adda12f9..f4b03f4c1b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.366 2010/02/01 13:40:28 sriggs Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.367 2010/02/07 20:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,7 @@
 #include "utils/builtins.h"
 #include "utils/guc.h"
 #include "utils/ps_status.h"
+#include "utils/relmapper.h"
 #include "pg_trace.h"
 
 
@@ -7123,6 +7124,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 	CheckPointCLOG();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
+	CheckPointRelationMap();
 	CheckPointBuffers(flags);	/* performs all required fsyncs */
 	/* We deliberately delay 2PC checkpointing as long as possible */
 	CheckPointTwoPhase(checkPointRedo);
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y
index a6c1243b95..9cc68501ff 100644
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.104 2010/01/28 23:21:11 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.105 2010/02/07 20:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -185,11 +185,26 @@ Boot_CreateStmt:
 		  RPAREN
 				{
 					TupleDesc tupdesc;
+					bool	shared_relation;
+					bool	mapped_relation;
 
 					do_start();
 
 					tupdesc = CreateTupleDesc(numattr, !($6), attrtypes);
 
+					shared_relation = $5;
+
+					/*
+					 * The catalogs that use the relation mapper are the
+					 * bootstrap catalogs plus the shared catalogs.  If this
+					 * ever gets more complicated, we should invent a BKI
+					 * keyword to mark the mapped catalogs, but for now a
+					 * quick hack seems the most appropriate thing.  Note in
+					 * particular that all "nailed" heap rels (see formrdesc
+					 * in relcache.c) must be mapped.
+					 */
+					mapped_relation = ($4 || shared_relation);
+
 					if ($4)
 					{
 						if (boot_reldesc)
@@ -200,11 +215,12 @@ Boot_CreateStmt:
 
 						boot_reldesc = heap_create($2,
 												   PG_CATALOG_NAMESPACE,
-												   $5 ? GLOBALTABLESPACE_OID : 0,
+												   shared_relation ? GLOBALTABLESPACE_OID : 0,
 												   $3,
 												   tupdesc,
 												   RELKIND_RELATION,
-												   $5,
+												   shared_relation,
+												   mapped_relation,
 												   true);
 						elog(DEBUG4, "bootstrap relation created");
 					}
@@ -214,7 +230,7 @@ Boot_CreateStmt:
 
 						id = heap_create_with_catalog($2,
 													  PG_CATALOG_NAMESPACE,
-													  $5 ? GLOBALTABLESPACE_OID : 0,
+													  shared_relation ? GLOBALTABLESPACE_OID : 0,
 													  $3,
 													  $7,
 													  InvalidOid,
@@ -222,7 +238,8 @@ Boot_CreateStmt:
 													  tupdesc,
 													  NIL,
 													  RELKIND_RELATION,
-													  $5,
+													  shared_relation,
+													  mapped_relation,
 													  true,
 													  0,
 													  ONCOMMIT_NOOP,
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 14e4b839e4..d2b7c1e585 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.258 2010/01/22 16:40:18 rhaas Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.259 2010/02/07 20:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,6 +42,7 @@
 #include "utils/fmgroids.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
+#include "utils/relmapper.h"
 #include "utils/tqual.h"
 
 extern int	optind;
@@ -491,6 +492,12 @@ BootstrapModeMain(void)
 	 */
 	boot_yyparse();
 
+	/*
+	 * We should now know about all mapped relations, so it's okay to
+	 * write out the initial relation mapping files.
+	 */
+	RelationMapFinishBootstrap();
+
 	/* Perform a checkpoint to ensure everything's down to disk */
 	SetProcessingMode(NormalProcessing);
 	CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c
index 1ce2f85510..943cc4920e 100644
--- a/src/backend/catalog/catalog.c
+++ b/src/backend/catalog/catalog.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.87 2010/01/12 02:42:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.88 2010/02/07 20:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -460,16 +460,16 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn)
  * created by bootstrap have preassigned OIDs, so there's no need.
  */
 Oid
-GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class)
+GetNewRelFileNode(Oid reltablespace, Relation pg_class)
 {
 	RelFileNode rnode;
 	char	   *rpath;
 	int			fd;
 	bool		collides;
 
-	/* This should match RelationInitPhysicalAddr */
+	/* This logic should match RelationInitPhysicalAddr */
 	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
-	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
+	rnode.dbNode = (rnode.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId;
 
 	do
 	{
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index c344b8e01c..bc232cd143 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.369 2010/02/03 01:14:16 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.370 2010/02/07 20:48:09 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -237,6 +237,7 @@ heap_create(const char *relname,
 			TupleDesc tupDesc,
 			char relkind,
 			bool shared_relation,
+			bool mapped_relation,
 			bool allow_system_table_mods)
 {
 	bool		create_storage;
@@ -307,7 +308,8 @@ heap_create(const char *relname,
 									 tupDesc,
 									 relid,
 									 reltablespace,
-									 shared_relation);
+									 shared_relation,
+									 mapped_relation);
 
 	/*
 	 * Have the storage manager create the relation's disk file, if needed.
@@ -364,7 +366,8 @@ heap_create(const char *relname,
  * --------------------------------
  */
 void
-CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind)
+CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind,
+						 bool allow_system_table_mods)
 {
 	int			i;
 	int			j;
@@ -418,7 +421,8 @@ CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind)
 	for (i = 0; i < natts; i++)
 	{
 		CheckAttributeType(NameStr(tupdesc->attrs[i]->attname),
-						   tupdesc->attrs[i]->atttypid);
+						   tupdesc->attrs[i]->atttypid,
+						   allow_system_table_mods);
 	}
 }
 
@@ -431,7 +435,8 @@ CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind)
  * --------------------------------
  */
 void
-CheckAttributeType(const char *attname, Oid atttypid)
+CheckAttributeType(const char *attname, Oid atttypid,
+				   bool allow_system_table_mods)
 {
 	char		att_typtype = get_typtype(atttypid);
 
@@ -450,9 +455,11 @@ CheckAttributeType(const char *attname, Oid atttypid)
 	{
 		/*
 		 * Refuse any attempt to create a pseudo-type column, except for a
-		 * special hack for pg_statistic: allow ANYARRAY during initdb
+		 * special hack for pg_statistic: allow ANYARRAY when modifying
+		 * system catalogs (this allows creating pg_statistic and cloning it
+		 * during VACUUM FULL)
 		 */
-		if (atttypid != ANYARRAYOID || IsUnderPostmaster)
+		if (atttypid != ANYARRAYOID || !allow_system_table_mods)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
 					 errmsg("column \"%s\" has pseudo-type %s",
@@ -479,7 +486,8 @@ CheckAttributeType(const char *attname, Oid atttypid)
 
 			if (attr->attisdropped)
 				continue;
-			CheckAttributeType(NameStr(attr->attname), attr->atttypid);
+			CheckAttributeType(NameStr(attr->attname), attr->atttypid,
+							   allow_system_table_mods);
 		}
 
 		relation_close(relation, AccessShareLock);
@@ -865,6 +873,7 @@ AddNewRelationType(const char *typeName,
  *	cooked_constraints: list of precooked check constraints and defaults
  *	relkind: relkind for new rel
  *	shared_relation: TRUE if it's to be a shared relation
+ *	mapped_relation: TRUE if the relation will use the relfilenode map
  *	oidislocal: TRUE if oid column (if any) should be marked attislocal
  *	oidinhcount: attinhcount to assign to oid column (if any)
  *	oncommit: ON COMMIT marking (only relevant if it's a temp table)
@@ -888,6 +897,7 @@ heap_create_with_catalog(const char *relname,
 						 List *cooked_constraints,
 						 char relkind,
 						 bool shared_relation,
+						 bool mapped_relation,
 						 bool oidislocal,
 						 int oidinhcount,
 						 OnCommitAction oncommit,
@@ -909,7 +919,7 @@ heap_create_with_catalog(const char *relname,
 	 */
 	Assert(IsNormalProcessingMode() || IsBootstrapProcessingMode());
 
-	CheckAttributeNamesTypes(tupdesc, relkind);
+	CheckAttributeNamesTypes(tupdesc, relkind, allow_system_table_mods);
 
 	if (get_relname_relid(relname, relnamespace))
 		ereport(ERROR,
@@ -938,23 +948,10 @@ heap_create_with_catalog(const char *relname,
 	}
 
 	/*
-	 * Validate shared/non-shared tablespace (must check this before doing
-	 * GetNewRelFileNode, to prevent Assert therein)
+	 * Shared relations must be in pg_global (last-ditch check)
 	 */
-	if (shared_relation)
-	{
-		if (reltablespace != GLOBALTABLESPACE_OID)
-			/* elog since this is not a user-facing error */
-			elog(ERROR,
-				 "shared relations must be placed in pg_global tablespace");
-	}
-	else
-	{
-		if (reltablespace == GLOBALTABLESPACE_OID)
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("only shared relations can be placed in pg_global tablespace")));
-	}
+	if (shared_relation && reltablespace != GLOBALTABLESPACE_OID)
+		elog(ERROR, "shared relations must be placed in pg_global tablespace");
 
 	/*
 	 * Allocate an OID for the relation, unless we were told what to use.
@@ -979,8 +976,7 @@ heap_create_with_catalog(const char *relname,
 			binary_upgrade_next_toast_relfilenode = InvalidOid;
 		}
 		else
-			relid = GetNewRelFileNode(reltablespace, shared_relation,
-									  pg_class_desc);
+			relid = GetNewRelFileNode(reltablespace, pg_class_desc);
 	}
 
 	/*
@@ -1019,6 +1015,7 @@ heap_create_with_catalog(const char *relname,
 							   tupdesc,
 							   relkind,
 							   shared_relation,
+							   mapped_relation,
 							   allow_system_table_mods);
 
 	Assert(relid == RelationGetRelid(new_rel_desc));
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index c6b6e76933..e614d3baf6 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.332 2010/02/03 01:14:16 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.333 2010/02/07 20:48:09 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -111,6 +111,11 @@ static void validate_index_heapscan(Relation heapRelation,
 						Snapshot snapshot,
 						v_i_state *state);
 static Oid	IndexGetRelation(Oid indexId);
+static void SetReindexProcessing(Oid heapOid, Oid indexOid);
+static void ResetReindexProcessing(void);
+static void SetReindexPending(List *indexes);
+static void RemoveReindexPending(Oid indexOid);
+static void ResetReindexPending(void);
 
 
 /*
@@ -257,7 +262,7 @@ ConstructTupleDescriptor(Relation heapRelation,
 			 * whether a table column is of a safe type (which is why we
 			 * needn't check for the non-expression case).
 			 */
-			CheckAttributeType(NameStr(to->attname), to->atttypid);
+			CheckAttributeType(NameStr(to->attname), to->atttypid, false);
 		}
 
 		/*
@@ -544,6 +549,7 @@ index_create(Oid heapRelationId,
 	Relation	indexRelation;
 	TupleDesc	indexTupDesc;
 	bool		shared_relation;
+	bool		mapped_relation;
 	bool		is_exclusion;
 	Oid			namespaceId;
 	int			i;
@@ -562,10 +568,12 @@ index_create(Oid heapRelationId,
 
 	/*
 	 * The index will be in the same namespace as its parent table, and is
-	 * shared across databases if and only if the parent is.
+	 * shared across databases if and only if the parent is.  Likewise,
+	 * it will use the relfilenode map if and only if the parent does.
 	 */
 	namespaceId = RelationGetNamespace(heapRelation);
 	shared_relation = heapRelation->rd_rel->relisshared;
+	mapped_relation = RelationIsMapped(heapRelation);
 
 	/*
 	 * check parameters
@@ -609,23 +617,10 @@ index_create(Oid heapRelationId,
 				 errmsg("shared indexes cannot be created after initdb")));
 
 	/*
-	 * Validate shared/non-shared tablespace (must check this before doing
-	 * GetNewRelFileNode, to prevent Assert therein)
+	 * Shared relations must be in pg_global, too (last-ditch check)
 	 */
-	if (shared_relation)
-	{
-		if (tableSpaceId != GLOBALTABLESPACE_OID)
-			/* elog since this is not a user-facing error */
-			elog(ERROR,
-				 "shared relations must be placed in pg_global tablespace");
-	}
-	else
-	{
-		if (tableSpaceId == GLOBALTABLESPACE_OID)
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("only shared relations can be placed in pg_global tablespace")));
-	}
+	if (shared_relation && tableSpaceId != GLOBALTABLESPACE_OID)
+		elog(ERROR, "shared relations must be placed in pg_global tablespace");
 
 	if (get_relname_relid(indexRelationName, namespaceId))
 		ereport(ERROR,
@@ -657,8 +652,7 @@ index_create(Oid heapRelationId,
 			binary_upgrade_next_index_relfilenode = InvalidOid;
 		}
 		else
-			indexRelationId = GetNewRelFileNode(tableSpaceId, shared_relation,
-												pg_class);
+			indexRelationId = GetNewRelFileNode(tableSpaceId, pg_class);
 	}
 
 	/*
@@ -673,6 +667,7 @@ index_create(Oid heapRelationId,
 								indexTupDesc,
 								RELKIND_INDEX,
 								shared_relation,
+								mapped_relation,
 								allow_system_table_mods);
 
 	Assert(indexRelationId == RelationGetRelid(indexRelation));
@@ -2413,7 +2408,6 @@ reindex_index(Oid indexId)
 				heapRelation,
 				pg_index;
 	Oid			heapId;
-	bool		inplace;
 	IndexInfo  *indexInfo;
 	HeapTuple	indexTuple;
 	Form_pg_index indexForm;
@@ -2446,23 +2440,6 @@ reindex_index(Oid indexId)
 	 */
 	CheckTableNotInUse(iRel, "REINDEX INDEX");
 
-	/*
-	 * If it's a shared index, we must do inplace processing (because we have
-	 * no way to update relfilenode in other databases).  Otherwise we can do
-	 * it the normal transaction-safe way.
-	 *
-	 * Since inplace processing isn't crash-safe, we only allow it in a
-	 * standalone backend.	(In the REINDEX TABLE and REINDEX DATABASE cases,
-	 * the caller should have detected this.)
-	 */
-	inplace = iRel->rd_rel->relisshared;
-
-	if (inplace && IsUnderPostmaster)
-		ereport(ERROR,
-				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-				 errmsg("shared index \"%s\" can only be reindexed in stand-alone mode",
-						RelationGetRelationName(iRel))));
-
 	PG_TRY();
 	{
 		/* Suppress use of the target index while rebuilding it */
@@ -2471,20 +2448,8 @@ reindex_index(Oid indexId)
 		/* Fetch info needed for index_build */
 		indexInfo = BuildIndexInfo(iRel);
 
-		if (inplace)
-		{
-			/*
-			 * Truncate the actual file (and discard buffers).
-			 */
-			RelationTruncate(iRel, 0);
-		}
-		else
-		{
-			/*
-			 * We'll build a new physical relation for the index.
-			 */
-			RelationSetNewRelfilenode(iRel, InvalidTransactionId);
-		}
+		/* We'll build a new physical relation for the index */
+		RelationSetNewRelfilenode(iRel, InvalidTransactionId);
 
 		/* Initialize the index and rebuild */
 		/* Note: we do not need to re-establish pkey setting */
@@ -2538,19 +2503,27 @@ reindex_index(Oid indexId)
  * reindex_relation - This routine is used to recreate all indexes
  * of a relation (and optionally its toast relation too, if any).
  *
+ * If heap_rebuilt is true, then the relation was just completely rebuilt by
+ * an operation such as VACUUM FULL or CLUSTER, and therefore its indexes are
+ * inconsistent with it.  This makes things tricky if the relation is a system
+ * catalog that we might consult during the reindexing.  To deal with that
+ * case, we mark all of the indexes as pending rebuild so that they won't be
+ * trusted until rebuilt.  The caller is required to call us *without* having
+ * made the rebuilt versions visible by doing CommandCounterIncrement; we'll
+ * do CCI after having collected the index list.  (This way we can still use
+ * catalog indexes while collecting the list.)
+ *
  * Returns true if any indexes were rebuilt.  Note that a
  * CommandCounterIncrement will occur after each index rebuild.
  */
 bool
-reindex_relation(Oid relid, bool toast_too)
+reindex_relation(Oid relid, bool toast_too, bool heap_rebuilt)
 {
 	Relation	rel;
 	Oid			toast_relid;
+	List	   *indexIds;
 	bool		is_pg_class;
 	bool		result;
-	List	   *indexIds,
-			   *doneIndexes;
-	ListCell   *indexId;
 
 	/*
 	 * Open and lock the relation.	ShareLock is sufficient since we only need
@@ -2580,9 +2553,9 @@ reindex_relation(Oid relid, bool toast_too)
 	 * It is okay to not insert entries into the indexes we have not processed
 	 * yet because all of this is transaction-safe.  If we fail partway
 	 * through, the updated rows are dead and it doesn't matter whether they
-	 * have index entries.	Also, a new pg_class index will be created with an
-	 * entry for its own pg_class row because we do RelationSetNewRelfilenode()
-	 * before we do index_build().
+	 * have index entries.  Also, a new pg_class index will be created with a
+	 * correct entry for its own pg_class row because we do
+	 * RelationSetNewRelfilenode() before we do index_build().
 	 *
 	 * Note that we also clear pg_class's rd_oidindex until the loop is done,
 	 * so that that index can't be accessed either.  This means we cannot
@@ -2595,22 +2568,51 @@ reindex_relation(Oid relid, bool toast_too)
 	if (is_pg_class)
 		(void) RelationGetIndexAttrBitmap(rel);
 
-	/* Reindex all the indexes. */
-	doneIndexes = NIL;
-	foreach(indexId, indexIds)
+	PG_TRY();
 	{
-		Oid			indexOid = lfirst_oid(indexId);
+		List	   *doneIndexes;
+		ListCell   *indexId;
 
-		if (is_pg_class)
-			RelationSetIndexList(rel, doneIndexes, InvalidOid);
+		if (heap_rebuilt)
+		{
+			/* Suppress use of all the indexes until they are rebuilt */
+			SetReindexPending(indexIds);
 
-		reindex_index(indexOid);
+			/*
+			 * Make the new heap contents visible --- now things might be
+			 * inconsistent!
+			 */
+			CommandCounterIncrement();
+		}
 
-		CommandCounterIncrement();
+		/* Reindex all the indexes. */
+		doneIndexes = NIL;
+		foreach(indexId, indexIds)
+		{
+			Oid			indexOid = lfirst_oid(indexId);
+
+			if (is_pg_class)
+				RelationSetIndexList(rel, doneIndexes, InvalidOid);
 
-		if (is_pg_class)
-			doneIndexes = lappend_oid(doneIndexes, indexOid);
+			reindex_index(indexOid);
+
+			CommandCounterIncrement();
+
+			if (heap_rebuilt)
+				RemoveReindexPending(indexOid);
+
+			if (is_pg_class)
+				doneIndexes = lappend_oid(doneIndexes, indexOid);
+		}
+	}
+	PG_CATCH();
+	{
+		/* Make sure list gets cleared on error exit */
+		ResetReindexPending();
+		PG_RE_THROW();
 	}
+	PG_END_TRY();
+	ResetReindexPending();
 
 	if (is_pg_class)
 		RelationSetIndexList(rel, indexIds, ClassOidIndexId);
@@ -2627,7 +2629,107 @@ reindex_relation(Oid relid, bool toast_too)
 	 * still hold the lock on the master table.
 	 */
 	if (toast_too && OidIsValid(toast_relid))
-		result |= reindex_relation(toast_relid, false);
+		result |= reindex_relation(toast_relid, false, false);
 
 	return result;
 }
+
+
+/* ----------------------------------------------------------------
+ *		System index reindexing support
+ *
+ * When we are busy reindexing a system index, this code provides support
+ * for preventing catalog lookups from using that index.
+ * ----------------------------------------------------------------
+ */
+
+static Oid	currentlyReindexedHeap = InvalidOid;
+static Oid	currentlyReindexedIndex = InvalidOid;
+static List *pendingReindexedIndexes = NIL;
+
+/*
+ * ReindexIsProcessingHeap
+ *		True if heap specified by OID is currently being reindexed.
+ */
+bool
+ReindexIsProcessingHeap(Oid heapOid)
+{
+	return heapOid == currentlyReindexedHeap;
+}
+
+/*
+ * ReindexIsProcessingIndex
+ *		True if index specified by OID is currently being reindexed,
+ *		or should be treated as invalid because it is awaiting reindex.
+ */
+bool
+ReindexIsProcessingIndex(Oid indexOid)
+{
+	return indexOid == currentlyReindexedIndex ||
+		list_member_oid(pendingReindexedIndexes, indexOid);
+}
+
+/*
+ * SetReindexProcessing
+ *		Set flag that specified heap/index are being reindexed.
+ *
+ * NB: caller must use a PG_TRY block to ensure ResetReindexProcessing is done.
+ */
+static void
+SetReindexProcessing(Oid heapOid, Oid indexOid)
+{
+	Assert(OidIsValid(heapOid) && OidIsValid(indexOid));
+	/* Reindexing is not re-entrant. */
+	if (OidIsValid(currentlyReindexedHeap))
+		elog(ERROR, "cannot reindex while reindexing");
+	currentlyReindexedHeap = heapOid;
+	currentlyReindexedIndex = indexOid;
+}
+
+/*
+ * ResetReindexProcessing
+ *		Unset reindexing status.
+ */
+static void
+ResetReindexProcessing(void)
+{
+	currentlyReindexedHeap = InvalidOid;
+	currentlyReindexedIndex = InvalidOid;
+}
+
+/*
+ * SetReindexPending
+ *		Mark the given indexes as pending reindex.
+ *
+ * NB: caller must use a PG_TRY block to ensure ResetReindexPending is done.
+ * Also, we assume that the current memory context stays valid throughout.
+ */
+static void
+SetReindexPending(List *indexes)
+{
+	/* Reindexing is not re-entrant. */
+	if (pendingReindexedIndexes)
+		elog(ERROR, "cannot reindex while reindexing");
+	pendingReindexedIndexes = list_copy(indexes);
+}
+
+/*
+ * RemoveReindexPending
+ *		Remove the given index from the pending list.
+ */
+static void
+RemoveReindexPending(Oid indexOid)
+{
+	pendingReindexedIndexes = list_delete_oid(pendingReindexedIndexes,
+											  indexOid);
+}
+
+/*
+ * ResetReindexPending
+ *		Forget any still-pending indexes (list cells are not freed here).
+ */
+static void
+ResetReindexPending(void)
+{
+	pendingReindexedIndexes = NIL;
+}
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 492d55fbcf..e087b653b9 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.7 2010/01/02 16:57:36 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.8 2010/02/07 20:48:09 tgl Exp $
  *
  * NOTES
  *	  Some of this code used to be in storage/smgr/smgr.c, and the
@@ -109,8 +109,7 @@ RelationCreateStorage(RelFileNode rnode, bool istemp)
 	if (!istemp)
 	{
 		/*
-		 * Make an XLOG entry showing the file creation.  If we abort, the
-		 * file will be dropped at abort time.
+		 * Make an XLOG entry reporting the file creation.
 		 */
 		xlrec.rnode = rnode;
 
@@ -165,6 +164,52 @@ RelationDropStorage(Relation rel)
 	RelationCloseSmgr(rel);
 }
 
+/*
+ * RelationPreserveStorage
+ *		Mark a relation as not to be deleted after all.
+ *
+ * We need this function because relation mapping changes are committed
+ * separately from commit of the whole transaction, so it's still possible
+ * for the transaction to abort after the mapping update is done.
+ * When a new physical relation is installed in the map, it would be
+ * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
+ * The relation mapper fixes this by telling us to not delete such relations
+ * after all as part of its commit.
+ *
+ * No-op if the relation is not among those scheduled for deletion.
+ */
+void
+RelationPreserveStorage(RelFileNode rnode)
+{
+	PendingRelDelete *pending;
+	PendingRelDelete *prev;
+	PendingRelDelete *next;
+
+	prev = NULL;
+	for (pending = pendingDeletes; pending != NULL; pending = next)
+	{
+		next = pending->next;
+		if (RelFileNodeEquals(rnode, pending->relnode))
+		{
+			/* we should only find delete-on-abort entries, else trouble */
+			if (pending->atCommit)
+				elog(ERROR, "cannot preserve a delete-on-commit relation");
+			/* unlink and delete list entry */
+			if (prev)
+				prev->next = next;
+			else
+				pendingDeletes = next;
+			pfree(pending);
+			/* prev does not change */
+		}
+		else
+		{
+			/* unrelated entry, don't touch it */
+			prev = pending;
+		}
+	}
+}
+
 /*
  * RelationTruncate
  *		Physically truncate a relation to the specified number of blocks.
@@ -200,13 +245,13 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	 * likely isn't going to succeed in the truncation either, and cause a
 	 * PANIC. It's tempting to put a critical section here, but that cure
 	 * would be worse than the disease. It would turn a usually harmless
-	 * failure to truncate, that could spell trouble at WAL replay, into a
+	 * failure to truncate, that might spell trouble at WAL replay, into a
 	 * certain PANIC.
 	 */
 	if (!rel->rd_istemp)
 	{
 		/*
-		 * Make an XLOG entry showing the file truncation.
+		 * Make an XLOG entry reporting the file truncation.
 		 */
 		XLogRecPtr	lsn;
 		XLogRecData rdata;
@@ -270,10 +315,8 @@ smgrDoPendingDeletes(bool isCommit)
 			/* do deletion if called for */
 			if (pending->atCommit == isCommit)
 			{
-				int			i;
-
-				/* schedule unlinking old files */
 				SMgrRelation srel;
+				int			i;
 
 				srel = smgropen(pending->relnode);
 				for (i = 0; i <= MAX_FORKNUM; i++)
@@ -440,7 +483,6 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
 			FreeFakeRelcacheEntry(rel);
 		}
-
 	}
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c
index ca70f19bf3..35bade50ea 100644
--- a/src/backend/catalog/toasting.c
+++ b/src/backend/catalog/toasting.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.29 2010/02/03 01:14:16 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.30 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -114,6 +114,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio
 	HeapTuple	reltup;
 	TupleDesc	tupdesc;
 	bool		shared_relation;
+	bool		mapped_relation;
 	Relation	class_rel;
 	Oid			toast_relid;
 	Oid			toast_idxid;
@@ -139,6 +140,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("shared tables cannot be toasted after initdb")));
 
+	/* It's mapped if and only if its parent is, too */
+	mapped_relation = RelationIsMapped(rel);
+
 	/*
 	 * Is it already toasted?
 	 */
@@ -148,7 +152,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio
 	/*
 	 * Check to see whether the table actually needs a TOAST table.
 	 *
-	 * If an update-in-place relfilenode is specified, force toast file
+	 * If an update-in-place toast relfilenode is specified, force toast file
 	 * creation even if it seems not to need one.
 	 */
 	if (!needs_toast_table(rel) &&
@@ -213,6 +217,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio
 										   NIL,
 										   RELKIND_TOASTVALUE,
 										   shared_relation,
+										   mapped_relation,
 										   true,
 										   0,
 										   ONCOMMIT_NOOP,
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index cf2ac19d53..da605bffac 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * cluster.c
- *	  CLUSTER a table on an index.
+ *	  CLUSTER a table on an index.  This is now also used for VACUUM FULL.
  *
  * There is hardly anything left of Paul Brown's original implementation...
  *
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.197 2010/02/04 00:09:14 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.198 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,6 +44,7 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
+#include "utils/relmapper.h"
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/tqual.h"
@@ -223,7 +224,8 @@ cluster(ClusterStmt *stmt, bool isTopLevel)
 			StartTransactionCommand();
 			/* functions in indexes may want a snapshot set */
 			PushActiveSnapshot(GetTransactionSnapshot());
-			cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose, -1, -1);
+			cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose,
+						-1, -1);
 			PopActiveSnapshot();
 			CommitTransactionCommand();
 		}
@@ -245,13 +247,13 @@ cluster(ClusterStmt *stmt, bool isTopLevel)
  * GRANT, inheritance nor references to this table (this was a bug
  * in releases thru 7.3).
  *
- * Also create new indexes and swap the filenodes with the old indexes the
- * same way we do for the relation.  Since we are effectively bulk-loading
+ * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
  * the new table, it's better to create the indexes afterwards than to fill
  * them incrementally while we load the table.
  *
  * If indexOid is InvalidOid, the table will be rewritten in physical order
- * instead of index order.
+ * instead of index order.  This is the new implementation of VACUUM FULL,
+ * and error messages should refer to the operation as VACUUM not CLUSTER.
  */
 void
 cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose,
@@ -300,8 +302,7 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose,
 		 * somebody is executing a database-wide CLUSTER), because there is
 		 * another check in cluster() which will stop any attempt to cluster
 		 * remote temp tables by name.	There is another check in
-		 * check_index_is_clusterable which is redundant, but we leave it for
-		 * extra safety.
+		 * cluster_rel (below), redundant but kept for extra safety.
 		 */
 		if (RELATION_IS_OTHER_TEMP(OldHeap))
 		{
@@ -344,10 +345,44 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose,
 		}
 	}
 
+	/*
+	 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
+	 * would work in most respects, but the index would only get marked as
+	 * indisclustered in the current database, leading to unexpected behavior
+	 * if CLUSTER were later invoked in another database.
+	 */
+	if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot cluster a shared catalog")));
+
+	/*
+	 * Don't process temp tables of other backends ... their local
+	 * buffer manager is not going to cope.
+	 */
+	if (RELATION_IS_OTHER_TEMP(OldHeap))
+	{
+		if (OidIsValid(indexOid))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot cluster temporary tables of other sessions")));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot vacuum temporary tables of other sessions")));
+	}
+
+	/*
+	 * Also check for active uses of the relation in the current transaction,
+	 * including open scans and pending AFTER trigger events.
+	 */
+	CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
+
 	/* Check heap and index are valid to cluster on */
-	check_index_is_clusterable(OldHeap, indexOid, recheck);
+	if (OidIsValid(indexOid))
+		check_index_is_clusterable(OldHeap, indexOid, recheck);
 
-	/* rebuild_relation does all the dirty work */
+	/* Log what we're doing (this could use more effort) */
 	if (OidIsValid(indexOid))
 		ereport(verbose ? INFO : DEBUG2,
 				(errmsg("clustering \"%s.%s\"",
@@ -358,6 +393,8 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose,
 				(errmsg("vacuuming \"%s.%s\"",
 						get_namespace_name(RelationGetNamespace(OldHeap)),
 						RelationGetRelationName(OldHeap))));
+
+	/* rebuild_relation does all the dirty work */
 	rebuild_relation(OldHeap, indexOid, freeze_min_age, freeze_table_age);
 
 	/* NB: rebuild_relation does heap_close() on OldHeap */
@@ -376,38 +413,6 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
 {
 	Relation	OldIndex;
 
-	/*
-	 * Disallow clustering system relations.  This will definitely NOT work
-	 * for shared relations (we have no way to update pg_class rows in other
-	 * databases), nor for nailed-in-cache relations (the relfilenode values
-	 * for those are hardwired, see relcache.c).  It might work for other
-	 * system relations, but I ain't gonna risk it.
-	 */
-	if (IsSystemRelation(OldHeap))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("\"%s\" is a system catalog",
-						RelationGetRelationName(OldHeap))));
-
-	/*
-	 * Don't allow cluster on temp tables of other backends ... their local
-	 * buffer manager is not going to cope.
-	 */
-	if (RELATION_IS_OTHER_TEMP(OldHeap))
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			   errmsg("cannot cluster temporary tables of other sessions")));
-
-	/*
-	 * Also check for active uses of the relation in the current transaction,
-	 * including open scans and pending AFTER trigger events.
-	 */
-	CheckTableNotInUse(OldHeap, "CLUSTER");
-
-	/* Skip checks for index if not specified. */
-	if (!OidIsValid(indexOid))
-		return;
-
 	OldIndex = index_open(indexOid, AccessExclusiveLock);
 
 	/*
@@ -421,6 +426,13 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
 						RelationGetRelationName(OldIndex),
 						RelationGetRelationName(OldHeap))));
 
+	/* Index AM must allow clustering */
+	if (!OldIndex->rd_am->amclusterable)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
+						RelationGetRelationName(OldIndex))));
+
 	/*
 	 * Disallow clustering on incomplete indexes (those that might not index
 	 * every row of the relation).	We could relax this by making a separate
@@ -433,12 +445,6 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
 				 errmsg("cannot cluster on partial index \"%s\"",
 						RelationGetRelationName(OldIndex))));
 
-	if (!OldIndex->rd_am->amclusterable)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
-						RelationGetRelationName(OldIndex))));
-
 	if (!OldIndex->rd_am->amindexnulls)
 	{
 		AttrNumber	colno;
@@ -585,6 +591,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
 	Oid			tableOid = RelationGetRelid(OldHeap);
 	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
 	Oid			OIDNewHeap;
+	bool		is_system_catalog;
 	bool		swap_toast_by_content;
 	TransactionId frozenXid;
 
@@ -592,6 +599,9 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
 	if (OidIsValid(indexOid))
 		mark_index_clustered(OldHeap, indexOid);
 
+	/* Remember if it's a system catalog */
+	is_system_catalog = IsSystemRelation(OldHeap);
+
 	/* Close relcache entry, but keep lock until transaction commit */
 	heap_close(OldHeap, NoLock);
 
@@ -603,12 +613,12 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
 				   freeze_min_age, freeze_table_age,
 				   &swap_toast_by_content, &frozenXid);
 
-	/* Swap the physical files of the old and new heaps */
-	swap_relation_files(tableOid, OIDNewHeap,
-						swap_toast_by_content, frozenXid);
-
-	/* Destroy the new heap, removing the old data along with it */
-	cleanup_heap_swap(tableOid, OIDNewHeap, swap_toast_by_content);
+	/*
+	 * Swap the physical files of the target and transient tables, then
+	 * rebuild the target's indexes and throw away the transient table.
+	 */
+	finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
+					 swap_toast_by_content, frozenXid);
 }
 
 
@@ -619,8 +629,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid,
  * NewTableSpace which might be different from OldHeap's.
  *
  * After this, the caller should load the new heap with transferred/modified
- * data, then call swap_relation_files, and finally call cleanup_heap_swap to
- * remove the debris.
+ * data, then call finish_heap_swap to complete the operation.
  */
 Oid
 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
@@ -666,6 +675,11 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
 	 * relnames.  Working around this seems more trouble than it's worth; in
 	 * particular, we can't create the new heap in a different namespace from
 	 * the old, or we will have problems with the TEMP status of temp tables.
+	 *
+	 * Note: the new heap is not a shared relation, even if we are rebuilding
+	 * a shared rel.  However, we do make the new heap mapped if the source
+	 * is mapped.  This simplifies swap_relation_files, and is absolutely
+	 * necessary for rebuilding pg_class, for reasons explained there.
 	 */
 	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
 
@@ -679,13 +693,14 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
 										  tupdesc,
 										  NIL,
 										  OldHeap->rd_rel->relkind,
-										  OldHeap->rd_rel->relisshared,
+										  false,
+										  RelationIsMapped(OldHeap),
 										  true,
 										  0,
 										  ONCOMMIT_NOOP,
 										  reloptions,
 										  false,
-										  allowSystemTableMods);
+										  true);
 
 	ReleaseSysCache(tuple);
 
@@ -696,14 +711,20 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
 	CommandCounterIncrement();
 
 	/*
-	 * If necessary, create a TOAST table for the new relation. Note that
-	 * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that
-	 * the TOAST table will be visible for insertion.
+	 * If necessary, create a TOAST table for the new relation.
+	 *
+	 * If the relation doesn't have a TOAST table already, the new relation
+	 * cannot need one either.  The other way around is possible though: if
+	 * some wide columns have been dropped, AlterTableCreateToastTable
+	 * can decide that no TOAST table is needed for the new table.
+	 *
+	 * Note that AlterTableCreateToastTable ends with CommandCounterIncrement,
+	 * so that the TOAST table will be visible for insertion.
 	 */
 	toastid = OldHeap->rd_rel->reltoastrelid;
-	reloptions = (Datum) 0;
 	if (OidIsValid(toastid))
 	{
+		/* keep the existing toast table's reloptions, if any */
 		tuple = SearchSysCache(RELOID,
 							   ObjectIdGetDatum(toastid),
 							   0, 0, 0);
@@ -713,11 +734,11 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
 									 &isNull);
 		if (isNull)
 			reloptions = (Datum) 0;
-	}
-	AlterTableCreateToastTable(OIDNewHeap, reloptions);
 
-	if (OidIsValid(toastid))
+		AlterTableCreateToastTable(OIDNewHeap, reloptions);
+
 		ReleaseSysCache(tuple);
+	}
 
 	heap_close(OldHeap, NoLock);
 
@@ -747,6 +768,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	IndexScanDesc indexScan;
 	HeapScanDesc heapScan;
 	bool		use_wal;
+	bool		is_system_catalog;
 	TransactionId OldestXmin;
 	TransactionId FreezeXid;
 	RewriteState rwstate;
@@ -786,9 +808,14 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	 */
 	if (!use_wal && !NewHeap->rd_istemp)
 	{
-		char reason[NAMEDATALEN + 20];
-		snprintf(reason, sizeof(reason), "CLUSTER on \"%s\"",
-				 RelationGetRelationName(NewHeap));
+		char reason[NAMEDATALEN + 32];
+
+		if (OldIndex != NULL)
+			snprintf(reason, sizeof(reason), "CLUSTER on \"%s\"",
+					 RelationGetRelationName(NewHeap));
+		else
+			snprintf(reason, sizeof(reason), "VACUUM FULL on \"%s\"",
+					 RelationGetRelationName(NewHeap));
 		XLogReportUnloggedStatement(reason);
 	}
 
@@ -841,6 +868,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 	/* return selected value to caller */
 	*pFreezeXid = FreezeXid;
 
+	/* Remember if it's a system catalog */
+	is_system_catalog = IsSystemRelation(OldHeap);
+
 	/* Initialize the rewrite operation */
 	rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
 
@@ -909,25 +939,31 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 
 				/*
-				 * We should not see this unless it's been inserted earlier in
-				 * our own transaction.
+				 * Since we hold exclusive lock on the relation, normally
+				 * the only way to see this is if it was inserted earlier
+				 * in our own transaction.  However, it can happen in system
+				 * catalogs, since we tend to release write lock before commit
+				 * there.  Give a warning if neither case applies; but in
+				 * any case we had better copy it.
 				 */
-				if (!TransactionIdIsCurrentTransactionId(
-									  HeapTupleHeaderGetXmin(tuple->t_data)))
-					elog(ERROR, "concurrent insert in progress");
+				if (!is_system_catalog &&
+					!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
+
+					elog(WARNING, "concurrent insert in progress within table \"%s\"",
+						 RelationGetRelationName(OldHeap));
 				/* treat as live */
 				isdead = false;
 				break;
 			case HEAPTUPLE_DELETE_IN_PROGRESS:
 
 				/*
-				 * We should not see this unless it's been deleted earlier in
-				 * our own transaction.
+				 * Similar situation to INSERT_IN_PROGRESS case.
 				 */
 				Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
-				if (!TransactionIdIsCurrentTransactionId(
-									  HeapTupleHeaderGetXmax(tuple->t_data)))
-					elog(ERROR, "concurrent delete in progress");
+				if (!is_system_catalog &&
+					!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple->t_data)))
+					elog(WARNING, "concurrent delete in progress within table \"%s\"",
+						 RelationGetRelationName(OldHeap));
 				/* treat as recently dead */
 				isdead = false;
 				break;
@@ -1016,21 +1052,29 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
  * table is added or removed altogether.
  *
  * Additionally, the first relation is marked with relfrozenxid set to
- * frozenXid.  It seems a bit ugly to have this here, but all callers would
+ * frozenXid.  It seems a bit ugly to have this here, but the caller would
  * have to do it anyway, so having it here saves a heap_update.  Note: in
  * the swap-toast-links case, we assume we don't need to change the toast
  * table's relfrozenxid: the new version of the toast table should already
  * have relfrozenxid set to RecentXmin, which is good enough.
+ *
+ * Lastly, if r2 and its toast table and toast index (if any) are mapped,
+ * their OIDs are emitted into mapped_tables[].  This is hacky but beats
+ * having to look the information up again later in finish_heap_swap.
  */
-void
-swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
-					TransactionId frozenXid)
+static void
+swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
+					bool swap_toast_by_content,
+					TransactionId frozenXid,
+					Oid *mapped_tables)
 {
 	Relation	relRelation;
 	HeapTuple	reltup1,
 				reltup2;
 	Form_pg_class relform1,
 				relform2;
+	Oid			relfilenode1,
+				relfilenode2;
 	Oid			swaptemp;
 	CatalogIndexState indstate;
 
@@ -1051,29 +1095,86 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
 		elog(ERROR, "cache lookup failed for relation %u", r2);
 	relform2 = (Form_pg_class) GETSTRUCT(reltup2);
 
-	/*
-	 * Actually swap the fields in the two tuples
-	 */
-	swaptemp = relform1->relfilenode;
-	relform1->relfilenode = relform2->relfilenode;
-	relform2->relfilenode = swaptemp;
+	relfilenode1 = relform1->relfilenode;
+	relfilenode2 = relform2->relfilenode;
 
-	swaptemp = relform1->reltablespace;
-	relform1->reltablespace = relform2->reltablespace;
-	relform2->reltablespace = swaptemp;
+	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
+	{
+		/* Normal non-mapped relations: swap relfilenodes and reltablespaces */
+		Assert(!target_is_pg_class);
 
-	if (!swap_toast_by_content)
+		swaptemp = relform1->relfilenode;
+		relform1->relfilenode = relform2->relfilenode;
+		relform2->relfilenode = swaptemp;
+
+		swaptemp = relform1->reltablespace;
+		relform1->reltablespace = relform2->reltablespace;
+		relform2->reltablespace = swaptemp;
+
+		/* Also swap toast links, if we're swapping by links */
+		if (!swap_toast_by_content)
+		{
+			swaptemp = relform1->reltoastrelid;
+			relform1->reltoastrelid = relform2->reltoastrelid;
+			relform2->reltoastrelid = swaptemp;
+
+			/* we should NOT swap reltoastidxid */
+		}
+	}
+	else
 	{
-		swaptemp = relform1->reltoastrelid;
-		relform1->reltoastrelid = relform2->reltoastrelid;
-		relform2->reltoastrelid = swaptemp;
+		/*
+		 * Mapped-relation case.  Here we have to swap the relation mappings
+		 * instead of modifying the pg_class columns.  Both must be mapped.
+		 */
+		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
+			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
+				 NameStr(relform1->relname));
+
+		/*
+		 * We can't change the tablespace of a mapped rel, and we can't handle
+		 * toast link swapping for one either, because we must not apply any
+		 * critical changes to its pg_class row.  These cases should be
+		 * prevented by upstream permissions tests, so this check is a
+		 * non-user-facing emergency backstop.
+		 */
+		if (relform1->reltablespace != relform2->reltablespace)
+			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
+				 NameStr(relform1->relname));
+		if (!swap_toast_by_content &&
+			(relform1->reltoastrelid || relform2->reltoastrelid))
+			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
+				 NameStr(relform1->relname));
 
-		/* we should not swap reltoastidxid */
+		/*
+		 * Fetch the mappings --- shouldn't fail, but be paranoid
+		 */
+		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
+		if (!OidIsValid(relfilenode1))
+			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
+				 NameStr(relform1->relname), r1);
+		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
+		if (!OidIsValid(relfilenode2))
+			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
+				 NameStr(relform2->relname), r2);
+
+		/*
+		 * Send replacement mappings to relmapper.  Note these won't actually
+		 * take effect until CommandCounterIncrement.
+		 */
+		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
+		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
+
+		/* Pass OIDs of mapped r2 tables back to caller */
+		*mapped_tables++ = r2;
 	}
 
 	/*
-	 * In the case of a shared catalog, these next few steps only affect our
-	 * own database's pg_class row; but that's okay.
+	 * In the case of a shared catalog, these next few steps will only affect
+	 * our own database's pg_class row; but that's okay, because they are
+	 * all noncritical updates.  That's also an important fact for the case
+	 * of a mapped catalog, because it's possible that we'll commit the map
+	 * change and then fail to commit the pg_class update.
 	 */
 
 	/* set rel1's frozen Xid */
@@ -1097,15 +1198,31 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
 		relform2->reltuples = swap_tuples;
 	}
 
-	/* Update the tuples in pg_class */
-	simple_heap_update(relRelation, &reltup1->t_self, reltup1);
-	simple_heap_update(relRelation, &reltup2->t_self, reltup2);
-
-	/* Keep system catalogs current */
-	indstate = CatalogOpenIndexes(relRelation);
-	CatalogIndexInsert(indstate, reltup1);
-	CatalogIndexInsert(indstate, reltup2);
-	CatalogCloseIndexes(indstate);
+	/*
+	 * Update the tuples in pg_class --- unless the target relation of the
+	 * swap is pg_class itself.  In that case, there is zero point in making
+	 * changes because we'd be updating the old data that we're about to
+	 * throw away.  Because the real work being done here for a mapped relation
+	 * is just to change the relation map settings, it's all right to not
+	 * update the pg_class rows in this case.
+	 */
+	if (!target_is_pg_class)
+	{
+		simple_heap_update(relRelation, &reltup1->t_self, reltup1);
+		simple_heap_update(relRelation, &reltup2->t_self, reltup2);
+
+		/* Keep system catalogs current */
+		indstate = CatalogOpenIndexes(relRelation);
+		CatalogIndexInsert(indstate, reltup1);
+		CatalogIndexInsert(indstate, reltup2);
+		CatalogCloseIndexes(indstate);
+	}
+	else
+	{
+		/* no update ... but we do still need relcache inval */
+		CacheInvalidateRelcacheByTuple(reltup1);
+		CacheInvalidateRelcacheByTuple(reltup2);
+	}
 
 	/*
 	 * If we have toast tables associated with the relations being swapped,
@@ -1120,8 +1237,10 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
 				/* Recursively swap the contents of the toast tables */
 				swap_relation_files(relform1->reltoastrelid,
 									relform2->reltoastrelid,
-									true,
-									frozenXid);
+									target_is_pg_class,
+									swap_toast_by_content,
+									frozenXid,
+									mapped_tables);
 			}
 			else
 			{
@@ -1146,6 +1265,15 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
 						toastobject;
 			long		count;
 
+			/*
+			 * We disallow this case for system catalogs, to avoid the
+			 * possibility that the catalog we're rebuilding is one of the
+			 * ones the dependency changes would change.  It's too late
+			 * to be making any data changes to the target catalog.
+			 */
+			if (IsSystemClass(relform1))
+				elog(ERROR, "cannot swap toast files by links for system catalogs");
+
 			/* Delete old dependencies */
 			if (relform1->reltoastrelid)
 			{
@@ -1196,30 +1324,35 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
 		relform1->reltoastidxid && relform2->reltoastidxid)
 			swap_relation_files(relform1->reltoastidxid,
 								relform2->reltoastidxid,
-								true,
-								InvalidTransactionId);
-
-	/*
-	 * Blow away the old relcache entries now.	We need this kluge because
-	 * relcache.c keeps a link to the smgr relation for the physical file, and
-	 * that will be out of date as soon as we do CommandCounterIncrement.
-	 * Whichever of the rels is the second to be cleared during cache
-	 * invalidation will have a dangling reference to an already-deleted smgr
-	 * relation.  Rather than trying to avoid this by ordering operations just
-	 * so, it's easiest to not have the relcache entries there at all.
-	 * (Fortunately, since one of the entries is local in our transaction,
-	 * it's sufficient to clear out our own relcache this way; the problem
-	 * cannot arise for other backends when they see our update on the
-	 * non-local relation.)
-	 */
-	RelationForgetRelation(r1);
-	RelationForgetRelation(r2);
+								target_is_pg_class,
+								swap_toast_by_content,
+								InvalidTransactionId,
+								mapped_tables);
 
 	/* Clean up. */
 	heap_freetuple(reltup1);
 	heap_freetuple(reltup2);
 
 	heap_close(relRelation, RowExclusiveLock);
+
+	/*
+	 * Close both relcache entries' smgr links.  We need this kluge because
+	 * both links will be invalidated during upcoming CommandCounterIncrement.
+	 * Whichever of the rels is the second to be cleared will have a dangling
+	 * reference to the other's smgr entry.  Rather than trying to avoid this
+	 * by ordering operations just so, it's easiest to close the links first.
+	 * (Fortunately, since one of the entries is local in our transaction,
+	 * it's sufficient to close our own smgr links this way; the problem
+	 * cannot arise for other backends when they see our update on the
+	 * non-transient relation.)
+	 *
+	 * Caution: the placement of this step interacts with the decision to
+	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
+	 * itself, the smgr close on pg_class must happen after all accesses in
+	 * this function.
+	 */
+	RelationCloseSmgrByOid(r1);
+	RelationCloseSmgrByOid(r2);
 }
 
 /*
@@ -1227,12 +1360,43 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
  * cleaning up (including rebuilding all indexes on the old heap).
  */
 void
-cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content)
+finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
+				 bool is_system_catalog,
+				 bool swap_toast_by_content,
+				 TransactionId frozenXid)
 {
 	ObjectAddress object;
+	Oid			mapped_tables[4];
+	int			i;
 
-	/* Make swap_relation_files' changes visible in the catalogs. */
-	CommandCounterIncrement();
+	/* Zero out possible results from swap_relation_files */
+	memset(mapped_tables, 0, sizeof(mapped_tables));
+
+	/*
+	 * Swap the contents of the heap relations (including any toast tables).
+	 * Also set old heap's relfrozenxid to frozenXid.
+	 */
+	swap_relation_files(OIDOldHeap, OIDNewHeap,
+						(OIDOldHeap == RelationRelationId),
+						swap_toast_by_content, frozenXid, mapped_tables);
+
+	/*
+	 * If it's a system catalog, queue an sinval message to flush all
+	 * catcaches on the catalog when we reach CommandCounterIncrement.
+	 */
+	if (is_system_catalog)
+		CacheInvalidateCatalog(OIDOldHeap);
+
+	/*
+	 * Rebuild each index on the relation (but not the toast table, which is
+	 * all-new at this point).  It is important to do this before the DROP
+	 * step because if we are processing a system catalog that will be used
+	 * during DROP, we want to have its indexes available.  There is no
+	 * advantage to the other order anyway because this is all transactional,
+	 * so no chance to reclaim disk space before commit.  We do not need
+	 * a final CommandCounterIncrement() because reindex_relation does it.
+	 */
+	reindex_relation(OIDOldHeap, false, true);
 
 	/* Destroy new heap with old filenode */
 	object.classId = RelationRelationId;
@@ -1248,11 +1412,13 @@ cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content)
 	/* performDeletion does CommandCounterIncrement at end */
 
 	/*
-	 * Rebuild each index on the relation (but not the toast table, which is
-	 * all-new at this point).	We do not need CommandCounterIncrement()
-	 * because reindex_relation does it.
+	 * Now we must remove any relation mapping entries that we set up for the
+	 * transient table, as well as its toast table and toast index if any.
+	 * If we fail to do this before commit, the relmapper will complain about
+	 * new permanent map entries being added post-bootstrap.
 	 */
-	reindex_relation(OIDOldHeap, false);
+	for (i = 0; OidIsValid(mapped_tables[i]); i++)
+		RelationMapRemoveMapping(mapped_tables[i]);
 
 	/*
 	 * At this point, everything is kosher except that, if we did toast swap
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index bbec82f207..7e6be57ee8 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.190 2010/01/02 16:57:37 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.191 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -244,10 +244,15 @@ DefineIndex(RangeVar *heapRelation,
 
 	/*
 	 * Force shared indexes into the pg_global tablespace.	This is a bit of a
-	 * hack but seems simpler than marking them in the BKI commands.
+	 * hack but seems simpler than marking them in the BKI commands.  On the
+	 * other hand, if it's not shared, don't allow it to be placed there.
 	 */
 	if (rel->rd_rel->relisshared)
 		tablespaceId = GLOBALTABLESPACE_OID;
+	else if (tablespaceId == GLOBALTABLESPACE_OID)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("only shared relations can be placed in pg_global tablespace")));
 
 	/*
 	 * Choose the index column names.
@@ -1615,16 +1620,9 @@ ReindexTable(RangeVar *relation)
 		aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
 					   relation->relname);
 
-	/* Can't reindex shared tables except in standalone mode */
-	if (((Form_pg_class) GETSTRUCT(tuple))->relisshared && IsUnderPostmaster)
-		ereport(ERROR,
-				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-				 errmsg("shared table \"%s\" can only be reindexed in stand-alone mode",
-						relation->relname)));
-
 	ReleaseSysCache(tuple);
 
-	if (!reindex_relation(heapOid, true))
+	if (!reindex_relation(heapOid, true, false))
 		ereport(NOTICE,
 				(errmsg("table \"%s\" has no indexes",
 						relation->relname)));
@@ -1717,12 +1715,6 @@ ReindexDatabase(const char *databaseName, bool do_system, bool do_user)
 				continue;
 		}
 
-		if (IsUnderPostmaster)	/* silently ignore shared tables */
-		{
-			if (classtuple->relisshared)
-				continue;
-		}
-
 		if (HeapTupleGetOid(tuple) == RelationRelationId)
 			continue;			/* got it already */
 
@@ -1743,7 +1735,7 @@ ReindexDatabase(const char *databaseName, bool do_system, bool do_user)
 		StartTransactionCommand();
 		/* functions in indexes may want a snapshot set */
 		PushActiveSnapshot(GetTransactionSnapshot());
-		if (reindex_relation(relid, true))
+		if (reindex_relation(relid, true, false))
 			ereport(NOTICE,
 					(errmsg("table \"%s\" was reindexed",
 							get_rel_name(relid))));
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index dba5f29d66..683c7f58d8 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.324 2010/02/04 00:09:14 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.325 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -436,6 +436,12 @@ DefineRelation(CreateStmt *stmt, char relkind)
 						   get_tablespace_name(tablespaceId));
 	}
 
+	/* In all cases disallow placing user relations in pg_global */
+	if (tablespaceId == GLOBALTABLESPACE_OID)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("only shared relations can be placed in pg_global tablespace")));
+
 	/*
 	 * Parse and validate reloptions, if any.
 	 */
@@ -534,6 +540,7 @@ DefineRelation(CreateStmt *stmt, char relkind)
 													  old_constraints),
 										  relkind,
 										  false,
+										  false,
 										  localHasOids,
 										  parentOidCount,
 										  stmt->oncommit,
@@ -1014,7 +1021,7 @@ ExecuteTruncate(TruncateStmt *stmt)
 			/*
 			 * Reconstruct the indexes to match, and we're done.
 			 */
-			reindex_relation(heap_relid, true);
+			reindex_relation(heap_relid, true, false);
 		}
 	}
 
@@ -1091,16 +1098,6 @@ truncate_check_rel(Relation rel)
 				 errmsg("permission denied: \"%s\" is a system catalog",
 						RelationGetRelationName(rel))));
 
-	/*
-	 * We can never allow truncation of shared or nailed-in-cache relations,
-	 * because we can't support changing their relfilenode values.
-	 */
-	if (rel->rd_rel->relisshared || rel->rd_isnailed)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot truncate system relation \"%s\"",
-						RelationGetRelationName(rel))));
-
 	/*
 	 * Don't allow truncate on temp tables of other backends ... their local
 	 * buffer manager is not going to cope.
@@ -2873,11 +2870,11 @@ ATRewriteTables(List **wqueue)
 			OldHeap = heap_open(tab->relid, NoLock);
 
 			/*
-			 * We can never allow rewriting of shared or nailed-in-cache
-			 * relations, because we can't support changing their relfilenode
-			 * values.
+			 * We don't support rewriting of system catalogs; there are
+			 * too many corner cases and too little benefit.  In particular
+			 * this is certainly not going to work for mapped catalogs.
 			 */
-			if (OldHeap->rd_rel->relisshared || OldHeap->rd_isnailed)
+			if (IsSystemRelation(OldHeap))
 				ereport(ERROR,
 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 						 errmsg("cannot rewrite system relation \"%s\"",
@@ -2914,17 +2911,14 @@ ATRewriteTables(List **wqueue)
 			ATRewriteTable(tab, OIDNewHeap);
 
 			/*
-			 * Swap the physical files of the old and new heaps.  Since we are
-			 * generating a new heap, we can use RecentXmin for the table's
-			 * new relfrozenxid because we rewrote all the tuples on
-			 * ATRewriteTable, so no older Xid remains in the table.  Also,
-			 * we never try to swap toast tables by content, since we have
-			 * no interest in letting this code work on system catalogs.
+			 * Swap the physical files of the old and new heaps, then rebuild
+			 * indexes and discard the new heap.  We can use RecentXmin for
+			 * the table's new relfrozenxid because we rewrote all the tuples
+			 * in ATRewriteTable, so no older Xid remains in the table.  Also,
+			 * we never try to swap toast tables by content, since we have no
+			 * interest in letting this code work on system catalogs.
 			 */
-			swap_relation_files(tab->relid, OIDNewHeap, false, RecentXmin);
-
-			/* Destroy the new heap, removing the old data along with it. */
-			cleanup_heap_swap(tab->relid, OIDNewHeap, false);
+			finish_heap_swap(tab->relid, OIDNewHeap, false, false, RecentXmin);
 		}
 		else
 		{
@@ -3715,7 +3709,7 @@ ATExecAddColumn(AlteredTableInfo *tab, Relation rel,
 	typeOid = HeapTupleGetOid(typeTuple);
 
 	/* make sure datatype is legal for a column */
-	CheckAttributeType(colDef->colname, typeOid);
+	CheckAttributeType(colDef->colname, typeOid, false);
 
 	/* construct new attribute's pg_attribute entry */
 	attribute.attrelid = myrelid;
@@ -5825,7 +5819,7 @@ ATPrepAlterColumnType(List **wqueue,
 	targettype = typenameTypeId(NULL, typeName, &targettypmod);
 
 	/* make sure datatype is legal for a column */
-	CheckAttributeType(colName, targettype);
+	CheckAttributeType(colName, targettype, false);
 
 	/*
 	 * Set up an expression to transform the old data value to the new type.
@@ -6925,10 +6919,21 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace)
 	rel = relation_open(tableOid, AccessExclusiveLock);
 
 	/*
-	 * We can never allow moving of shared or nailed-in-cache relations,
-	 * because we can't support changing their reltablespace values.
+	 * No work if no change in tablespace.
+	 */
+	oldTableSpace = rel->rd_rel->reltablespace;
+	if (newTableSpace == oldTableSpace ||
+		(newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0))
+	{
+		relation_close(rel, NoLock);
+		return;
+	}
+
+	/*
+	 * We cannot support moving mapped relations into different tablespaces.
+	 * (In particular this eliminates all shared catalogs.)
 	 */
-	if (rel->rd_rel->relisshared || rel->rd_isnailed)
+	if (RelationIsMapped(rel))
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("cannot move system relation \"%s\"",
@@ -6949,17 +6954,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("cannot move temporary tables of other sessions")));
 
-	/*
-	 * No work if no change in tablespace.
-	 */
-	oldTableSpace = rel->rd_rel->reltablespace;
-	if (newTableSpace == oldTableSpace ||
-		(newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0))
-	{
-		relation_close(rel, NoLock);
-		return;
-	}
-
 	reltoastrelid = rel->rd_rel->reltoastrelid;
 	reltoastidxid = rel->rd_rel->reltoastidxid;
 
@@ -6985,9 +6979,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace)
 	 * Relfilenodes are not unique across tablespaces, so we need to allocate
 	 * a new one in the new tablespace.
 	 */
-	newrelfilenode = GetNewRelFileNode(newTableSpace,
-									   rel->rd_rel->relisshared,
-									   NULL);
+	newrelfilenode = GetNewRelFileNode(newTableSpace, NULL);
 
 	/* Open old and new relation */
 	newrnode = rel->rd_node;
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 850680950e..e18ed084b4 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.403 2010/01/06 05:31:13 itagaki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.404 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1183,11 +1183,10 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
 
 	/*
 	 * Do the actual work --- either FULL, FULL INPLACE, or "lazy" vacuum.
-	 * We can use only FULL INPLACE vacuum for system relations.
 	 */
 	if (!(vacstmt->options & VACOPT_FULL))
 		heldoff = lazy_vacuum_rel(onerel, vacstmt, vac_strategy, scanned_all);
-	else if ((vacstmt->options & VACOPT_INPLACE) || IsSystemRelation(onerel))
+	else if (vacstmt->options & VACOPT_INPLACE)
 		heldoff = full_vacuum_rel(onerel, vacstmt);
 	else
 	{
@@ -1196,8 +1195,8 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound,
 		onerel = NULL;
 
 		cluster_rel(relid, InvalidOid, false,
-			(vacstmt->options & VACOPT_VERBOSE) != 0,
-			vacstmt->freeze_min_age, vacstmt->freeze_table_age);
+					(vacstmt->options & VACOPT_VERBOSE) != 0,
+					vacstmt->freeze_min_age, vacstmt->freeze_table_age);
 		heldoff = false;
 	}
 
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 5a8af0b2f7..85566b77dc 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -26,7 +26,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.344 2010/02/03 10:01:30 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.345 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2168,6 +2168,7 @@ OpenIntoRel(QueryDesc *queryDesc)
 											  NIL,
 											  RELKIND_RELATION,
 											  false,
+											  false,
 											  true,
 											  0,
 											  into->onCommit,
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 0f615e674c..54c5cb39e8 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.195 2010/01/02 16:57:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.196 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -609,7 +609,7 @@ transformRangeFunction(ParseState *pstate, RangeFunction *r)
 		tupdesc = BuildDescFromLists(rte->eref->colnames,
 									 rte->funccoltypes,
 									 rte->funccoltypmods);
-		CheckAttributeNamesTypes(tupdesc, RELKIND_COMPOSITE_TYPE);
+		CheckAttributeNamesTypes(tupdesc, RELKIND_COMPOSITE_TYPE, false);
 	}
 
 	return rte;
diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c
index d2d9197587..d894e8906c 100644
--- a/src/backend/utils/adt/dbsize.c
+++ b/src/backend/utils/adt/dbsize.c
@@ -1,11 +1,11 @@
 /*
  * dbsize.c
- *		object size functions
+ *		Database object size functions, and related inquiries
  *
  * Copyright (c) 2002-2010, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/dbsize.c,v 1.28 2010/01/23 21:29:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/dbsize.c,v 1.29 2010/02/07 20:48:10 tgl Exp $
  *
  */
 
@@ -25,6 +25,7 @@
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
+#include "utils/relmapper.h"
 #include "utils/syscache.h"
 
 
@@ -507,3 +508,121 @@ pg_size_pretty(PG_FUNCTION_ARGS)
 
 	PG_RETURN_TEXT_P(cstring_to_text(buf));
 }
+
+/*
+ * Get the filenode of a relation
+ *
+ * This is expected to be used in queries like
+ *		SELECT pg_relation_filenode(oid) FROM pg_class;
+ * That leads to a couple of choices.  We work from the pg_class row alone
+ * rather than actually opening each relation, for efficiency.  We don't
+ * fail if we can't find the relation --- some rows might be visible in
+ * the query's MVCC snapshot but already dead according to SnapshotNow.
+ * (Note: we could avoid using the catcache, but there's little point
+ * because the relation mapper also works "in the now".)  We also don't
+ * fail if the relation doesn't have storage.  In all these cases it
+ * seems better to quietly return NULL.
+ */
+Datum
+pg_relation_filenode(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	Oid			filenode = InvalidOid;	/* stays invalid when no storage */
+	HeapTuple	classtup;
+	Form_pg_class classform;
+
+	/* Look only at the pg_class row; the relation itself is never opened */
+	classtup = SearchSysCache(RELOID, ObjectIdGetDatum(relid), 0, 0, 0);
+	if (!HeapTupleIsValid(classtup))
+		PG_RETURN_NULL();		/* no such row: quietly return NULL */
+	classform = (Form_pg_class) GETSTRUCT(classtup);
+
+	switch (classform->relkind)
+	{
+		case RELKIND_RELATION:
+		case RELKIND_INDEX:
+		case RELKIND_SEQUENCE:
+		case RELKIND_TOASTVALUE:
+
+			/*
+			 * These relkinds have storage.  Take the filenode straight from
+			 * pg_class when it is set there, else ask the relation mapper.
+			 */
+			filenode = classform->relfilenode;
+			if (!OidIsValid(filenode))
+				filenode = RelationMapOidToFilenode(relid,
+													classform->relisshared);
+			break;
+
+		default:
+			/* no storage; filenode stays InvalidOid so we return NULL */
+			break;
+	}
+
+	ReleaseSysCache(classtup);
+
+	if (OidIsValid(filenode))
+		PG_RETURN_OID(filenode);
+	PG_RETURN_NULL();
+}
+
+/*
+ * Get the pathname (relative to $PGDATA) of a relation
+ *
+ * See comments for pg_relation_filenode.
+ */
+Datum
+pg_relation_filepath(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	HeapTuple	classtup;
+	Form_pg_class classform;
+	RelFileNode physaddr;
+
+	/*
+	 * As in pg_relation_filenode, work from the pg_class row alone and
+	 * quietly return NULL when the row cannot be found.
+	 */
+	classtup = SearchSysCache(RELOID, ObjectIdGetDatum(relid), 0, 0, 0);
+	if (!HeapTupleIsValid(classtup))
+		PG_RETURN_NULL();
+
+	classform = (Form_pg_class) GETSTRUCT(classtup);
+
+	physaddr.relNode = InvalidOid;	/* until we know there is storage */
+
+	switch (classform->relkind)
+	{
+		case RELKIND_RELATION:
+		case RELKIND_INDEX:
+		case RELKIND_SEQUENCE:
+		case RELKIND_TOASTVALUE:
+			/* These have storage; keep in step with RelationInitPhysicalAddr */
+
+			/* an unset reltablespace falls back to MyDatabaseTableSpace */
+			physaddr.spcNode = OidIsValid(classform->reltablespace) ?
+				classform->reltablespace : MyDatabaseTableSpace;
+
+			/* relations in pg_global are not tied to any one database */
+			physaddr.dbNode = (physaddr.spcNode == GLOBALTABLESPACE_OID) ?
+				InvalidOid : MyDatabaseId;
+
+			/* a zero relfilenode sends us to the relation mapper */
+			physaddr.relNode = classform->relfilenode;
+			if (!OidIsValid(physaddr.relNode))
+				physaddr.relNode =
+					RelationMapOidToFilenode(relid, classform->relisshared);
+			break;
+
+		default:
+			/* no storage; relNode stays InvalidOid so we return NULL */
+			break;
+	}
+
+	ReleaseSysCache(classtup);
+
+	if (!OidIsValid(physaddr.relNode))
+		PG_RETURN_NULL();
+
+	PG_RETURN_TEXT_P(cstring_to_text(relpath(physaddr, MAIN_FORKNUM)));
+}
diff --git a/src/backend/utils/cache/Makefile b/src/backend/utils/cache/Makefile
index 617cb677f7..d1caf8e4ae 100644
--- a/src/backend/utils/cache/Makefile
+++ b/src/backend/utils/cache/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for utils/cache
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.25 2010/01/22 16:40:19 rhaas Exp $
+#    $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.26 2010/02/07 20:48:10 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,7 +12,7 @@ subdir = src/backend/utils/cache
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o \
+OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o relmapper.o \
 	spccache.o syscache.o lsyscache.o typcache.o ts_cache.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c
index 8b606a8da2..aac1e87d87 100644
--- a/src/backend/utils/cache/catcache.c
+++ b/src/backend/utils/cache/catcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.148 2010/01/02 16:57:55 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.149 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,6 +28,7 @@
 #endif
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 #include "utils/resowner.h"
@@ -679,17 +680,6 @@ ResetCatalogCaches(void)
  *	or a temp table being dropped at end of transaction, or a table created
  *	during the current transaction that is being dropped because of abort.)
  *	Remove all cache entries relevant to the specified relation OID.
- *
- *	A special case occurs when relId is itself one of the cacheable system
- *	tables --- although those'll never be dropped, they can get flushed from
- *	the relcache (VACUUM causes this, for example).  In that case we need
- *	to flush all cache entries that came from that table.  (At one point we
- *	also tried to force re-execution of CatalogCacheInitializeCache for
- *	the cache(s) on that table.  This is a bad idea since it leads to all
- *	kinds of trouble if a cache flush occurs while loading cache entries.
- *	We now avoid the need to do it by copying cc_tupdesc out of the relcache,
- *	rather than relying on the relcache to keep a tupdesc for us.  Of course
- *	this assumes the tupdesc of a cachable system table will not change...)
  */
 void
 CatalogCacheFlushRelation(Oid relId)
@@ -706,14 +696,6 @@ CatalogCacheFlushRelation(Oid relId)
 		if (cache->cc_tupdesc == NULL)
 			continue;
 
-		/* Does this cache store tuples of the target relation itself? */
-		if (cache->cc_tupdesc->attrs[0]->attrelid == relId)
-		{
-			/* Yes, so flush all its contents */
-			ResetCatalogCache(cache);
-			continue;
-		}
-
 		/* Does this cache store tuples associated with relations at all? */
 		if (cache->cc_reloidattr == 0)
 			continue;			/* nope, leave it alone */
@@ -775,6 +757,46 @@ CatalogCacheFlushRelation(Oid relId)
 	CACHE1_elog(DEBUG2, "end of CatalogCacheFlushRelation call");
 }
 
+/*
+ *		CatalogCacheFlushCatalog
+ *
+ *	Flush all catcache entries that came from the specified system catalog.
+ *	This is needed after VACUUM FULL/CLUSTER on the catalog, since the
+ *	tuples very likely now have different TIDs than before.  (At one point
+ *	we also tried to force re-execution of CatalogCacheInitializeCache for
+ *	the cache(s) on that catalog.  This is a bad idea since it leads to all
+ *	kinds of trouble if a cache flush occurs while loading cache entries.
+ *	We now avoid the need to do it by copying cc_tupdesc out of the relcache,
+ *	rather than relying on the relcache to keep a tupdesc for us.  Of course
+ *	this assumes the tupdesc of a cacheable system table will not change...)
+ */
+void
+CatalogCacheFlushCatalog(Oid catId)
+{
+	CatCache   *ccp;
+
+	CACHE2_elog(DEBUG2, "CatalogCacheFlushCatalog called for %u", catId);
+
+	for (ccp = CacheHdr->ch_caches; ccp != NULL; ccp = ccp->cc_next)
+	{
+		/* An uninitialized cache holds no entries, so nothing to flush */
+		if (ccp->cc_tupdesc == NULL)
+			continue;
+
+		/* Skip caches whose tuples come from some other catalog */
+		if (ccp->cc_tupdesc->attrs[0]->attrelid != catId)
+			continue;
+
+		/* This cache is built on the target catalog: discard all entries */
+		ResetCatalogCache(ccp);
+
+		/* ... and have inval.c run the syscache callbacks for this cache */
+		CallSyscacheCallbacks(ccp->id, NULL);
+	}
+
+	CACHE1_elog(DEBUG2, "end of CatalogCacheFlushCatalog call");
+}
+
 /*
  *		InitCatCache
  *
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 99aad752bb..96439fda18 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -80,7 +80,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.93 2010/02/03 01:14:17 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.94 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -96,6 +96,7 @@
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/relmapper.h"
 #include "utils/syscache.h"
 
 
@@ -325,6 +326,21 @@ AddCatcacheInvalidationMessage(InvalidationListHeader *hdr,
 	AddInvalidationMessage(&hdr->cclist, &msg);
 }
 
+/*
+ * Add a whole-catalog inval entry
+ */
+static void
+AddCatalogInvalidationMessage(InvalidationListHeader *hdr, Oid dbId, Oid catId)
+{
+	SharedInvalidationMessage catmsg;
+
+	/* Build a whole-catalog message and append it to the header's cclist */
+	catmsg.cat.id = SHAREDINVALCATALOG_ID;
+	catmsg.cat.catId = catId;
+	catmsg.cat.dbId = dbId;
+	AddInvalidationMessage(&hdr->cclist, &catmsg);
+}
+
 /*
  * Add a relcache inval entry
  */
@@ -406,6 +422,18 @@ RegisterCatcacheInvalidation(int cacheId,
 								   cacheId, hashValue, tuplePtr, dbId);
 }
 
+/*
+ * RegisterCatalogInvalidation
+ *
+ * Register an invalidation event for all catcache entries from a catalog.
+ */
+static void
+RegisterCatalogInvalidation(Oid dbId, Oid catId)
+{
+	AddCatalogInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
+								  dbId, catId);	/* only queued here, not sent */
+}
+
 /*
  * RegisterRelcacheInvalidation
  *
@@ -443,30 +471,32 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
 static void
 LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 {
-	int			i;
-
 	if (msg->id >= 0)
 	{
-		if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == 0)
+		if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == InvalidOid)
 		{
 			CatalogCacheIdInvalidate(msg->cc.id,
 									 msg->cc.hashValue,
 									 &msg->cc.tuplePtr);
 
-			for (i = 0; i < syscache_callback_count; i++)
-			{
-				struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i;
+			CallSyscacheCallbacks(msg->cc.id, &msg->cc.tuplePtr);
+		}
+	}
+	else if (msg->id == SHAREDINVALCATALOG_ID)
+	{
+		if (msg->cat.dbId == MyDatabaseId || msg->cat.dbId == InvalidOid)
+		{
+			CatalogCacheFlushCatalog(msg->cat.catId);
 
-				if (ccitem->id == msg->cc.id)
-					(*ccitem->function) (ccitem->arg,
-										 msg->cc.id, &msg->cc.tuplePtr);
-			}
+			/* CatalogCacheFlushCatalog calls CallSyscacheCallbacks as needed */
 		}
 	}
 	else if (msg->id == SHAREDINVALRELCACHE_ID)
 	{
 		if (msg->rc.dbId == MyDatabaseId || msg->rc.dbId == InvalidOid)
 		{
+			int			i;
+
 			RelationCacheInvalidateEntry(msg->rc.relId);
 
 			for (i = 0; i < relcache_callback_count; i++)
@@ -485,6 +515,14 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
 		 */
 		smgrclosenode(msg->sm.rnode);
 	}
+	else if (msg->id == SHAREDINVALRELMAP_ID)
+	{
+		/* We only care about our own database and shared catalogs */
+		if (msg->rm.dbId == InvalidOid)
+			RelationMapInvalidate(true);
+		else if (msg->rm.dbId == MyDatabaseId)
+			RelationMapInvalidate(false);
+	}
 	else
 		elog(FATAL, "unrecognized SI message id: %d", msg->id);
 }
@@ -506,7 +544,7 @@ InvalidateSystemCaches(void)
 	int			i;
 
 	ResetCatalogCaches();
-	RelationCacheInvalidate();	/* gets smgr cache too */
+	RelationCacheInvalidate();	/* gets smgr and relmap too */
 
 	for (i = 0; i < syscache_callback_count; i++)
 	{
@@ -874,7 +912,7 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
 			else
 			{
 				/*
-				 * Invalidation message is a SHAREDINVALSMGR_ID
+				 * Invalidation message is a catalog or nontransactional inval,
 				 * which never cause relcache file invalidation,
 				 * so we ignore them, no matter which db they're for.
 				 */
@@ -1182,6 +1220,30 @@ CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple)
 	PrepareForTupleInvalidation(relation, tuple);
 }
 
+/*
+ * CacheInvalidateCatalog
+ *		Register invalidation of the whole content of a system catalog.
+ *
+ * This is normally used in VACUUM FULL/CLUSTER, where we haven't so much
+ * changed any tuples as moved them around.  Some uses of catcache entries
+ * expect their TIDs to be correct, so we have to blow away the entries.
+ *
+ * Note: we expect caller to verify that the rel actually is a system
+ * catalog.  If it isn't, no great harm is done, just a wasted sinval message.
+ */
+void
+CacheInvalidateCatalog(Oid catalogId)
+{
+	Oid			targetDb;
+
+	/*
+	 * Shared catalogs are tagged with InvalidOid, which every backend accepts
+	 * as applying to it; other catalogs are tagged with our own database.
+	 */
+	targetDb = IsSharedRelation(catalogId) ? InvalidOid : MyDatabaseId;
+	RegisterCatalogInvalidation(targetDb, catalogId);
+}
+
 /*
  * CacheInvalidateRelcache
  *		Register invalidation of the specified relation's relcache entry
@@ -1277,6 +1339,31 @@ CacheInvalidateSmgr(RelFileNode rnode)
 	SendSharedInvalidMessages(&msg, 1);
 }
 
+/*
+ * CacheInvalidateRelmap
+ *		Register invalidation of the relation mapping for a database,
+ *		or for the shared catalogs if databaseId is zero.
+ *
+ * Sending this type of invalidation msg forces other backends to re-read
+ * the indicated relation mapping file.  It is also necessary to send a
+ * relcache inval for the specific relations whose mapping has been altered,
+ * else the relcache won't get updated with the new filenode data.
+ *
+ * Note: because these messages are nontransactional, they won't be captured
+ * in commit/abort WAL entries.  Instead, calls to CacheInvalidateRelmap()
+ * should happen in low-level relmapper.c routines, which are executed while
+ * replaying WAL as well as when creating it.
+ */
+void
+CacheInvalidateRelmap(Oid databaseId)
+{
+	SharedInvalidationMessage msg;
+
+	msg.rm.id = SHAREDINVALRELMAP_ID;
+	msg.rm.dbId = databaseId;	/* InvalidOid (zero) selects the shared map */
+	SendSharedInvalidMessages(&msg, 1);	/* sent directly, not queued */
+}
+
 
 /*
  * CacheRegisterSyscacheCallback
@@ -1323,3 +1410,23 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
 
 	++relcache_callback_count;
 }
+
+/*
+ * CallSyscacheCallbacks
+ *
+ * This is exported so that CatalogCacheFlushCatalog can call it, saving
+ * this module from knowing which catcache IDs correspond to which catalogs.
+ */
+void
+CallSyscacheCallbacks(int cacheid, ItemPointer tuplePtr)
+{
+	int			slot = 0;
+
+	while (slot < syscache_callback_count)	/* visit every registered entry */
+	{
+		struct SYSCACHECALLBACK *cb = &syscache_callback_list[slot++];
+
+		if (cb->id == cacheid)	/* fire only those attached to this cache */
+			cb->function(cb->arg, cacheid, tuplePtr);
+	}
+}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index e71416c0f7..ff85195ed1 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.302 2010/02/04 00:09:14 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.303 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -72,6 +72,7 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/relcache.h"
+#include "utils/relmapper.h"
 #include "utils/resowner.h"
 #include "utils/syscache.h"
 #include "utils/tqual.h"
@@ -838,6 +839,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
 	 */
 	relid = HeapTupleGetOid(pg_class_tuple);
 	relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
+	Assert(relid == targetRelId);
 
 	/*
 	 * allocate storage for the relation descriptor, and copy pg_class_tuple
@@ -927,6 +929,10 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
 
 /*
  * Initialize the physical addressing info (RelFileNode) for a relcache entry
+ *
+ * Note: at the physical level, relations in the pg_global tablespace must
+ * be treated as shared, even if relisshared isn't set.  Hence we do not
+ * look at relisshared here.
  */
 static void
 RelationInitPhysicalAddr(Relation relation)
@@ -935,11 +941,22 @@ RelationInitPhysicalAddr(Relation relation)
 		relation->rd_node.spcNode = relation->rd_rel->reltablespace;
 	else
 		relation->rd_node.spcNode = MyDatabaseTableSpace;
-	if (relation->rd_rel->relisshared)
+	if (relation->rd_node.spcNode == GLOBALTABLESPACE_OID)
 		relation->rd_node.dbNode = InvalidOid;
 	else
 		relation->rd_node.dbNode = MyDatabaseId;
-	relation->rd_node.relNode = relation->rd_rel->relfilenode;
+	if (relation->rd_rel->relfilenode)
+		relation->rd_node.relNode = relation->rd_rel->relfilenode;
+	else
+	{
+		/* Consult the relation mapper */
+		relation->rd_node.relNode =
+			RelationMapOidToFilenode(relation->rd_id,
+									 relation->rd_rel->relisshared);
+		if (!OidIsValid(relation->rd_node.relNode))
+			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
+				 RelationGetRelationName(relation), relation->rd_id);
+	}
 }
 
 /*
@@ -1496,7 +1513,18 @@ formrdesc(const char *relationName, Oid relationReltype,
 	 * initialize relation id from info in att array (my, this is ugly)
 	 */
 	RelationGetRelid(relation) = relation->rd_att->attrs[0]->attrelid;
-	relation->rd_rel->relfilenode = RelationGetRelid(relation);
+
+	/*
+	 * All relations made with formrdesc are mapped.  This is necessarily so
+	 * because there is no other way to know what filenode they currently
+	 * have.  In bootstrap mode, add them to the initial relation mapper data,
+	 * specifying that the initial filenode is the same as the OID.
+	 */
+	relation->rd_rel->relfilenode = InvalidOid;
+	if (IsBootstrapProcessingMode())
+		RelationMapUpdateMap(RelationGetRelid(relation),
+							 RelationGetRelid(relation),
+							 isshared, true);
 
 	/*
 	 * initialize the relation lock manager information
@@ -1841,7 +1869,9 @@ RelationClearRelation(Relation relation, bool rebuild)
 	 * Never, never ever blow away a nailed-in system relation, because we'd
 	 * be unable to recover.  However, we must reset rd_targblock, in case we
 	 * got called because of a relation cache flush that was triggered by
-	 * VACUUM.  Likewise reset the fsm and vm size info.
+	 * VACUUM.  Likewise reset the fsm and vm size info.  Also, redo
+	 * RelationInitPhysicalAddr in case it is a mapped relation whose mapping
+	 * changed.
 	 *
 	 * If it's a nailed index, then we need to re-read the pg_class row to see
 	 * if its relfilenode changed.	We can't necessarily do that here, because
@@ -1855,6 +1885,9 @@ RelationClearRelation(Relation relation, bool rebuild)
 		relation->rd_targblock = InvalidBlockNumber;
 		relation->rd_fsm_nblocks = InvalidBlockNumber;
 		relation->rd_vm_nblocks = InvalidBlockNumber;
+		/* We must recalculate physical address in case it changed */
+		RelationInitPhysicalAddr(relation);
+
 		if (relation->rd_rel->relkind == RELKIND_INDEX)
 		{
 			relation->rd_isvalid = false;		/* needs to be revalidated */
@@ -1885,7 +1918,8 @@ RelationClearRelation(Relation relation, bool rebuild)
 
 	/*
 	 * Clear out catcache's entries for this relation.  This is a bit of
-	 * a hack, but it's a convenient place to do it.
+	 * a hack, but it's a convenient place to do it.  (XXX do we really
+	 * still need this?)
 	 */
 	CatalogCacheFlushRelation(RelationGetRelid(relation));
 
@@ -2104,7 +2138,7 @@ RelationCacheInvalidateEntry(Oid relationId)
  * RelationCacheInvalidate
  *	 Blow away cached relation descriptors that have zero reference counts,
  *	 and rebuild those with positive reference counts.	Also reset the smgr
- *	 relation cache.
+ *	 relation cache and re-read relation mapping data.
  *
  *	 This is currently used only to recover from SI message buffer overflow,
  *	 so we do not touch new-in-transaction relations; they cannot be targets
@@ -2190,6 +2224,11 @@ RelationCacheInvalidate(void)
 	 */
 	smgrcloseall();
 
+	/*
+	 * Reload relation mapping data before starting to reconstruct cache.
+	 */
+	RelationMapInvalidateAll();
+
 	/* Phase 2: rebuild the items found to need rebuild in phase 1 */
 	foreach(l, rebuildFirstList)
 	{
@@ -2205,6 +2244,25 @@ RelationCacheInvalidate(void)
 	list_free(rebuildList);
 }
 
+/*
+ * RelationCloseSmgrByOid - close a relcache entry's smgr link
+ *
+ * Needed in some cases where we are changing a relation's physical mapping.
+ * The link will be automatically reopened on next use.
+ */
+void
+RelationCloseSmgrByOid(Oid relationId)
+{
+	Relation	rel;
+
+	/* See whether the relcache currently holds an entry for this OID */
+	RelationIdCacheLookup(relationId, rel);
+
+	/* Found one?  Then drop its smgr link; otherwise nothing to do */
+	if (PointerIsValid(rel))
+		RelationCloseSmgr(rel);
+}
+
 /*
  * AtEOXact_RelationCache
  *
@@ -2393,7 +2451,8 @@ RelationBuildLocalRelation(const char *relname,
 						   TupleDesc tupDesc,
 						   Oid relid,
 						   Oid reltablespace,
-						   bool shared_relation)
+						   bool shared_relation,
+						   bool mapped_relation)
 {
 	Relation	rel;
 	MemoryContext oldcxt;
@@ -2409,6 +2468,8 @@ RelationBuildLocalRelation(const char *relname,
 	 *
 	 * XXX this list had better match the relations specially handled in
 	 * RelationCacheInitializePhase2/3.
+	 *
+	 * XXX do we need this at all??
 	 */
 	switch (relid)
 	{
@@ -2434,6 +2495,9 @@ RelationBuildLocalRelation(const char *relname,
 		elog(ERROR, "shared_relation flag for \"%s\" does not match IsSharedRelation(%u)",
 			 relname, relid);
 
+	/* Shared relations had better be mapped, too */
+	Assert(mapped_relation || !shared_relation);
+
 	/*
 	 * switch to the cache context to create the relcache entry.
 	 */
@@ -2512,7 +2576,9 @@ RelationBuildLocalRelation(const char *relname,
 	/*
 	 * Insert relation physical and logical identifiers (OIDs) into the right
 	 * places.	Note that the physical ID (relfilenode) is initially the same
-	 * as the logical ID (OID).
+	 * as the logical ID (OID); except that for a mapped relation, we set
+	 * relfilenode to zero and rely on RelationInitPhysicalAddr to consult
+	 * the map.
 	 */
 	rel->rd_rel->relisshared = shared_relation;
 	rel->rd_rel->relistemp = rel->rd_istemp;
@@ -2522,9 +2588,17 @@ RelationBuildLocalRelation(const char *relname,
 	for (i = 0; i < natts; i++)
 		rel->rd_att->attrs[i]->attrelid = relid;
 
-	rel->rd_rel->relfilenode = relid;
 	rel->rd_rel->reltablespace = reltablespace;
 
+	if (mapped_relation)
+	{
+		rel->rd_rel->relfilenode = InvalidOid;
+		/* Add it to the active mapping information */
+		RelationMapUpdateMap(relid, relid, shared_relation, true);
+	}
+	else
+		rel->rd_rel->relfilenode = relid;
+
 	RelationInitLockInfo(rel);	/* see lmgr.c */
 
 	RelationInitPhysicalAddr(rel);
@@ -2577,24 +2651,16 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
 	HeapTuple	tuple;
 	Form_pg_class classform;
 
-	/* Can't change relfilenode for nailed tables (indexes ok though) */
-	Assert(!relation->rd_isnailed ||
-		   relation->rd_rel->relkind == RELKIND_INDEX);
-	/* Can't change for shared tables or indexes */
-	Assert(!relation->rd_rel->relisshared);
 	/* Indexes must have Invalid frozenxid; other relations must not */
 	Assert((relation->rd_rel->relkind == RELKIND_INDEX &&
 			freezeXid == InvalidTransactionId) ||
 		   TransactionIdIsNormal(freezeXid));
 
 	/* Allocate a new relfilenode */
-	newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
-									   relation->rd_rel->relisshared,
-									   NULL);
+	newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL);
 
 	/*
-	 * Find the pg_class tuple for the given relation.	This is not used
-	 * during bootstrap, so okay to use heap_update always.
+	 * Get a writable copy of the pg_class tuple for the given relation.
 	 */
 	pg_class = heap_open(RelationRelationId, RowExclusiveLock);
 
@@ -2623,12 +2689,23 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
 	RelationDropStorage(relation);
 
 	/*
-	 * Now update the pg_class row.
+	 * Now update the pg_class row.  However, if we're dealing with a mapped
+	 * index, pg_class.relfilenode doesn't change; instead we have to send
+	 * the update to the relation mapper.
 	 */
-	classform->relfilenode = newrelfilenode;
+	if (RelationIsMapped(relation))
+		RelationMapUpdateMap(RelationGetRelid(relation),
+							 newrelfilenode,
+							 relation->rd_rel->relisshared,
+							 false);
+	else
+		classform->relfilenode = newrelfilenode;
+
+	/* These changes are safe even for a mapped relation */
 	classform->relpages = 0;		/* it's empty until further notice */
 	classform->reltuples = 0;
 	classform->relfrozenxid = freezeXid;
+
 	simple_heap_update(pg_class, &tuple->t_self, tuple);
 	CatalogUpdateIndexes(pg_class, tuple);
 
@@ -2637,8 +2714,8 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
 	heap_close(pg_class, RowExclusiveLock);
 
 	/*
-	 * Make the pg_class row change visible.  This will cause the relcache
-	 * entry to get updated, too.
+	 * Make the pg_class row change visible, as well as the relation map
+	 * change if any.  This will cause the relcache entry to get updated, too.
 	 */
 	CommandCounterIncrement();
 
@@ -2687,6 +2764,11 @@ RelationCacheInitialize(void)
 	ctl.hash = oid_hash;
 	RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE,
 								  &ctl, HASH_ELEM | HASH_FUNCTION);
+
+	/*
+	 * the relation mapper needs to be initialized too
+	 */
+	RelationMapInitialize();
 }
 
 /*
@@ -2704,6 +2786,11 @@ RelationCacheInitializePhase2(void)
 {
 	MemoryContext oldcxt;
 
+	/*
+	 * the relation mapper needs to be initialized too
+	 */
+	RelationMapInitializePhase2();
+
 	/*
 	 * In bootstrap mode, pg_database isn't there yet anyway, so do nothing.
 	 */
@@ -2752,6 +2839,11 @@ RelationCacheInitializePhase3(void)
 	MemoryContext oldcxt;
 	bool		needNewCacheFile = !criticalSharedRelcachesBuilt;
 
+	/*
+	 * the relation mapper needs to be initialized too
+	 */
+	RelationMapInitializePhase3();
+
 	/*
 	 * switch to cache memory context
 	 */
diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c
new file mode 100644
index 0000000000..b22cadf6eb
--- /dev/null
+++ b/src/backend/utils/cache/relmapper.c
@@ -0,0 +1,913 @@
+/*-------------------------------------------------------------------------
+ *
+ * relmapper.c
+ *	  Catalog-to-filenode mapping
+ *
+ * For most tables, the physical file underlying the table is specified by
+ * pg_class.relfilenode.  However, that obviously won't work for pg_class
+ * itself, nor for the other "nailed" catalogs for which we have to be able
+ * to set up working Relation entries without access to pg_class.  It also
+ * does not work for shared catalogs, since there is no practical way to
+ * update other databases' pg_class entries when relocating a shared catalog.
+ * Therefore, for these special catalogs (henceforth referred to as "mapped
+ * catalogs") we rely on a separately maintained file that shows the mapping
+ * from catalog OIDs to filenode numbers.  Each database has a map file for
+ * its local mapped catalogs, and there is a separate map file for shared
+ * catalogs.  Mapped catalogs have zero in their pg_class.relfilenode entries.
+ *
+ * Relocation of a normal table is committed (ie, the new physical file becomes
+ * authoritative) when the pg_class row update commits.  For mapped catalogs,
+ * the act of updating the map file is effectively commit of the relocation.
+ * We postpone the file update till just before commit of the transaction
+ * doing the rewrite, but there is necessarily a window between.  Therefore
+ * mapped catalogs can only be relocated by operations such as VACUUM FULL
+ * and CLUSTER, which make no transactionally-significant changes: it must be
+ * safe for the new file to replace the old, even if the transaction itself
+ * aborts.  An important factor here is that the indexes and toast table of
+ * a mapped catalog must also be mapped, so that the rewrites/relocations of
+ * all these files commit in a single map file update rather than being tied
+ * to transaction commit.
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/relmapper.c,v 1.1 2010/02/07 20:48:10 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_tablespace.h"
+#include "catalog/storage.h"
+#include "miscadmin.h"
+#include "storage/fd.h"
+#include "storage/lwlock.h"
+#include "utils/inval.h"
+#include "utils/pg_crc.h"
+#include "utils/relmapper.h"
+
+
+/*
+ * The map file is critical data: we have no automatic method for recovering
+ * from loss or corruption of it.  We use a CRC so that we can detect
+ * corruption.  To minimize the risk of failed updates, the map file should
+ * be kept to no more than one standard-size disk sector (ie 512 bytes),
+ * and we use overwrite-in-place rather than playing renaming games.
+ * The struct layout below is designed to occupy exactly 512 bytes, which
+ * might make filesystem updates a bit more efficient.
+ *
+ * Entries in the mappings[] array are in no particular order.  We could
+ * speed searching by insisting on OID order, but it really shouldn't be
+ * worth the trouble given the intended size of the mapping sets.
+ */
+#define RELMAPPER_FILENAME		"pg_filenode.map"
+
+#define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */
+
+#define MAX_MAPPINGS			62			/* 62 * 8 + 16 = 512 */
+
+typedef struct RelMapping		/* one catalog-OID-to-filenode pair */
+{
+	Oid			mapoid;			/* OID of a mapped catalog (the lookup key) */
+	Oid			mapfilenode;	/* filenode number currently mapped to it */
+} RelMapping;
+
+typedef struct RelMapFile		/* image of one map file, also used in memory */
+{
+	int32		magic;			/* RELMAPPER_FILEMAGIC in a loaded/written map */
+	int32		num_mappings;	/* number of valid RelMapping entries */
+	RelMapping	mappings[MAX_MAPPINGS];	/* unordered; first num_mappings used */
+	int32		crc;			/* CRC of all the fields above */
+	int32		pad;			/* to make the struct size be 512 exactly */
+} RelMapFile;
+
+/*
+ * The currently known contents of the shared map file and our database's
+ * local map file are stored here.  These can be reloaded from disk
+ * immediately whenever we receive an update sinval message.
+ */
+static RelMapFile shared_map;
+static RelMapFile local_map;
+
+/*
+ * We use the same RelMapFile data structure to track uncommitted local
+ * changes in the mappings (but note the magic and crc fields are not made
+ * valid in these variables).  Currently, map updates are not allowed within
+ * subtransactions, so one set of transaction-level changes is sufficient.
+ *
+ * The active_xxx variables contain updates that are valid in our transaction
+ * and should be honored by RelationMapOidToFilenode.  The pending_xxx
+ * variables contain updates we have been told about that aren't active yet;
+ * they will become active at the next CommandCounterIncrement.  This setup
+ * lets map updates act similarly to updates of pg_class rows, ie, they
+ * become visible only at the next CommandCounterIncrement boundary.
+ */
+static RelMapFile active_shared_updates;
+static RelMapFile active_local_updates;
+static RelMapFile pending_shared_updates;
+static RelMapFile pending_local_updates;
+
+
+/* non-export function prototypes */
+static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
+							 bool add_okay);
+static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
+							  bool add_okay);
+static void load_relmap_file(bool shared);
+static void write_relmap_file(bool shared, RelMapFile *newmap,
+				  bool write_wal, bool send_sinval, bool preserve_files,
+				  Oid dbid, Oid tsid, const char *dbpath);
+static void perform_relmap_update(bool shared, const RelMapFile *updates);
+
+
+/*
+ * RelationMapOidToFilenode
+ *
+ * The raison d'etre ... given a mapped relation's OID, look up its filenode.
+ *
+ * Although shared and local relation OIDs should never overlap, the caller
+ * always knows which we need --- so pass that information to avoid useless
+ * searching.  Only the maps selected by "shared" are consulted.
+ *
+ * Returns InvalidOid if the OID is not known (which should never happen,
+ * but the caller is in a better position to report a meaningful error).
+ */
+Oid
+RelationMapOidToFilenode(Oid relationId, bool shared)
+{
+	const RelMapFile *map;
+	int32		i;
+
+	/* If there are active updates, believe those over the main maps */
+	if (shared)
+	{
+		map = &active_shared_updates;
+		for (i = 0; i < map->num_mappings; i++)
+		{
+			if (relationId == map->mappings[i].mapoid)
+				return map->mappings[i].mapfilenode;
+		}
+		map = &shared_map;		/* no active update; use the permanent map */
+		for (i = 0; i < map->num_mappings; i++)
+		{
+			if (relationId == map->mappings[i].mapoid)
+				return map->mappings[i].mapfilenode;
+		}
+	}
+	else
+	{
+		map = &active_local_updates;
+		for (i = 0; i < map->num_mappings; i++)
+		{
+			if (relationId == map->mappings[i].mapoid)
+				return map->mappings[i].mapfilenode;
+		}
+		map = &local_map;		/* no active update; use the permanent map */
+		for (i = 0; i < map->num_mappings; i++)
+		{
+			if (relationId == map->mappings[i].mapoid)
+				return map->mappings[i].mapfilenode;
+		}
+	}
+
+	return InvalidOid;			/* not present in either map searched */
+}
+
+/*
+ * RelationMapUpdateMap
+ *
+ * Install a new relfilenode mapping for the specified relation.
+ *
+ * If immediate is true, the mapping is activated immediately; else it is
+ * pending until CommandCounterIncrement.  Bootstrap uses the permanent map.
+ */
+void
+RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
+					 bool immediate)
+{
+	RelMapFile *map;
+
+	if (IsBootstrapProcessingMode())
+	{
+		/*
+		 * In bootstrap mode, the mapping gets installed in permanent map.
+		 */
+		if (shared)
+			map = &shared_map;
+		else
+			map = &local_map;
+	}
+	else
+	{
+		/*
+		 * We don't currently support map changes within subtransactions.
+		 * This could be done with more bookkeeping infrastructure, but it
+		 * doesn't presently seem worth it.
+		 */
+		if (GetCurrentTransactionNestLevel() > 1)
+			elog(ERROR, "cannot change relation mapping within subtransaction");
+
+		if (immediate)
+		{
+			/* Make it active, but only locally */
+			if (shared)
+				map = &active_shared_updates;
+			else
+				map = &active_local_updates;
+		}
+		else
+		{
+			/* Make it pending */
+			if (shared)
+				map = &pending_shared_updates;
+			else
+				map = &pending_local_updates;
+		}
+	}
+	apply_map_update(map, relationId, fileNode, true);	/* may add an entry */
+}
+
+/*
+ * apply_map_update
+ *
+ * Insert a new mapping into the given map variable, replacing any existing
+ * mapping for the same relation.  Errors out if the map is already full.
+ *
+ * In some cases the caller knows there must be an existing mapping; pass
+ * add_okay = false to draw an error if not.
+ */
+static void
+apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
+{
+	int32		i;
+
+	/* Replace any existing mapping */
+	for (i = 0; i < map->num_mappings; i++)
+	{
+		if (relationId == map->mappings[i].mapoid)
+		{
+			map->mappings[i].mapfilenode = fileNode;
+			return;
+		}
+	}
+
+	/* Nope, need to add a new mapping (appended after the last valid one) */
+	if (!add_okay)
+		elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
+			 relationId);
+	if (map->num_mappings >= MAX_MAPPINGS)
+		elog(ERROR, "ran out of space in relation map");
+	map->mappings[map->num_mappings].mapoid = relationId;
+	map->mappings[map->num_mappings].mapfilenode = fileNode;
+	map->num_mappings++;
+}
+
+/*
+ * merge_map_updates
+ *
+ * Merge all the updates in the given pending-update map into the target map.
+ * This is just a bulk form of apply_map_update; add_okay is passed through.
+ */
+static void
+merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
+{
+	int32		i;
+
+	for (i = 0; i < updates->num_mappings; i++)
+	{
+		apply_map_update(map,
+						 updates->mappings[i].mapoid,
+						 updates->mappings[i].mapfilenode,
+						 add_okay);
+	}
+}
+
+/*
+ * RelationMapRemoveMapping
+ *
+ * Remove a relation's entry in the map.  This is only allowed for "active"
+ * (but not committed) local mappings.  We need it so we can back out the
+ * entry for the transient target file when doing VACUUM FULL/CLUSTER on
+ * a mapped relation.  It is an error if no such entry exists.
+ */
+void
+RelationMapRemoveMapping(Oid relationId)
+{
+	RelMapFile *map = &active_local_updates;
+	int32		i;
+
+	for (i = 0; i < map->num_mappings; i++)
+	{
+		if (relationId == map->mappings[i].mapoid)
+		{
+			/* Found it; overwrite with the last entry and shrink the array */
+			map->mappings[i] = map->mappings[map->num_mappings - 1];
+			map->num_mappings--;
+			return;
+		}
+	}
+	elog(ERROR, "could not find temporary mapping for relation %u",
+		 relationId);
+}
+
+/*
+ * RelationMapInvalidate
+ *
+ * This routine is invoked for SI cache flush messages.  We must re-read
+ * the indicated map file.  However, we might receive a SI message in a
+ * process that hasn't yet, and might never, load the mapping files;
+ * for example the autovacuum launcher, which *must not* try to read
+ * a local map since it is attached to no particular database.
+ * So, re-read only if the map is valid now (its magic field is set).
+ */
+void
+RelationMapInvalidate(bool shared)
+{
+	if (shared)
+	{
+		if (shared_map.magic == RELMAPPER_FILEMAGIC)	/* loaded before? */
+			load_relmap_file(true);
+	}
+	else
+	{
+		if (local_map.magic == RELMAPPER_FILEMAGIC)		/* loaded before? */
+			load_relmap_file(false);
+	}
+}
+
+/*
+ * RelationMapInvalidateAll
+ *
+ * Reload all map files.  This is used to recover from SI message buffer
+ * overflow: we can't be sure if we missed an inval message.
+ * Again, reload only currently-valid maps (those whose magic field is set).
+ */
+void
+RelationMapInvalidateAll(void)
+{
+	if (shared_map.magic == RELMAPPER_FILEMAGIC)
+		load_relmap_file(true);		/* shared map */
+	if (local_map.magic == RELMAPPER_FILEMAGIC)
+		load_relmap_file(false);	/* this database's local map */
+}
+
+/*
+ * AtCCI_RelationMap
+ *
+ * At CommandCounterIncrement, fold "pending" map updates into the active sets.
+ */
+void
+AtCCI_RelationMap(void)
+{
+	if (pending_shared_updates.num_mappings != 0)
+	{
+		merge_map_updates(&active_shared_updates,
+						  &pending_shared_updates,
+						  true);	/* new entries may be added */
+		pending_shared_updates.num_mappings = 0;	/* pending set now empty */
+	}
+	if (pending_local_updates.num_mappings != 0)
+	{
+		merge_map_updates(&active_local_updates,
+						  &pending_local_updates,
+						  true);	/* new entries may be added */
+		pending_local_updates.num_mappings = 0;		/* pending set now empty */
+	}
+}
+
+/*
+ * AtEOXact_RelationMap
+ *
+ * Handle relation mapping at main-transaction commit or abort.
+ *
+ * During commit, this must be called as late as possible before the actual
+ * transaction commit, so as to minimize the window where the transaction
+ * could still roll back after committing map changes.  Although nothing
+ * critically bad happens in such a case, we still would prefer that it
+ * not happen, since we'd possibly be losing useful updates to the relations'
+ * pg_class row(s).
+ *
+ * During abort, we just throw away all active and pending map changes.
+ * Normal post-abort cleanup will take care of fixing relcache entries.
+ */
+void
+AtEOXact_RelationMap(bool isCommit)
+{
+	if (isCommit)
+	{
+		/*
+		 * We should not get here with any "pending" updates.  (We could
+		 * logically choose to treat such as committed, but in the current
+		 * code this should never happen.)
+		 */
+		Assert(pending_shared_updates.num_mappings == 0);
+		Assert(pending_local_updates.num_mappings == 0);
+
+		/*
+		 * Write any active updates to the actual map files, then reset them.
+		 */
+		if (active_shared_updates.num_mappings != 0)
+		{
+			perform_relmap_update(true, &active_shared_updates);
+			active_shared_updates.num_mappings = 0;
+		}
+		if (active_local_updates.num_mappings != 0)
+		{
+			perform_relmap_update(false, &active_local_updates);
+			active_local_updates.num_mappings = 0;
+		}
+	}
+	else
+	{
+		/* Abort --- drop all active and pending updates */
+		active_shared_updates.num_mappings = 0;
+		active_local_updates.num_mappings = 0;
+		pending_shared_updates.num_mappings = 0;
+		pending_local_updates.num_mappings = 0;
+	}
+}
+
+/*
+ * AtPrepare_RelationMap
+ *
+ * Handle relation mapping at PREPARE.
+ *
+ * We don't support preparing a transaction that changed the map: error out.
+ */
+void
+AtPrepare_RelationMap(void)
+{
+	if (active_shared_updates.num_mappings != 0 ||
+		active_local_updates.num_mappings != 0 ||
+		pending_shared_updates.num_mappings != 0 ||
+		pending_local_updates.num_mappings != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot PREPARE a transaction that modified relation mapping")));
+}
+
+/*
+ * CheckPointRelationMap
+ *
+ * This is called during a checkpoint.  It must ensure that any relation map
+ * updates that were WAL-logged before the start of the checkpoint are
+ * securely flushed to disk and will not need to be replayed later.  This
+ * seems unlikely to be a performance-critical issue, so we use a simple
+ * method: we just take and release the RelationMappingLock.  This ensures
+ * that any already-logged map update is complete, because write_relmap_file
+ * will fsync the map file before the lock is released.
+ */
+void
+CheckPointRelationMap(void)
+{
+	LWLockAcquire(RelationMappingLock, LW_SHARED);	/* waits out any updater */
+	LWLockRelease(RelationMappingLock);		/* nothing to do while holding it */
+}
+
+/*
+ * RelationMapFinishBootstrap
+ *
+ * Write out the initial relation mapping files at the completion of
+ * bootstrap.  All the mapped files should have been made known to us
+ * via RelationMapUpdateMap calls, which filled shared_map and local_map.
+ */
+void
+RelationMapFinishBootstrap(void)
+{
+	Assert(IsBootstrapProcessingMode());
+
+	/* Shouldn't be any active or pending transactional updates ... */
+	Assert(active_shared_updates.num_mappings == 0);
+	Assert(active_local_updates.num_mappings == 0);
+	Assert(pending_shared_updates.num_mappings == 0);
+	Assert(pending_local_updates.num_mappings == 0);
+
+	/* Write the files; no WAL, sinval, or file preservation needed */
+	write_relmap_file(true, &shared_map, false, false, false,
+					  InvalidOid, GLOBALTABLESPACE_OID, NULL);
+	write_relmap_file(false, &local_map, false, false, false,
+					  MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
+}
+
+/*
+ * RelationMapInitialize
+ *
+ * This initializes the mapper module at process startup.  We can't access the
+ * database yet, so just make sure all six map variables are empty.
+ */
+void
+RelationMapInitialize(void)
+{
+	/* The static variables should initialize to zeroes, but let's be sure */
+	shared_map.magic = 0;		/* mark it not loaded */
+	local_map.magic = 0;		/* likewise */
+	shared_map.num_mappings = 0;
+	local_map.num_mappings = 0;
+	active_shared_updates.num_mappings = 0;
+	active_local_updates.num_mappings = 0;
+	pending_shared_updates.num_mappings = 0;
+	pending_local_updates.num_mappings = 0;
+}
+
+/*
+ * RelationMapInitializePhase2
+ *
+ * This is called to prepare for access to pg_database during startup.
+ * We should be able to read the shared map file now (except in bootstrap).
+ */
+void
+RelationMapInitializePhase2(void)
+{
+	/*
+	 * In bootstrap mode, the map file isn't there yet, so do nothing.
+	 */
+	if (IsBootstrapProcessingMode())
+		return;
+
+	/*
+	 * Load the shared map file into shared_map, die on error.
+	 */
+	load_relmap_file(true);
+}
+
+/*
+ * RelationMapInitializePhase3
+ *
+ * This is called as soon as we have determined MyDatabaseId and set up
+ * DatabasePath.  At this point we should be able to read the local map file.
+ */
+void
+RelationMapInitializePhase3(void)
+{
+	/*
+	 * In bootstrap mode, the map file isn't there yet, so do nothing.
+	 */
+	if (IsBootstrapProcessingMode())
+		return;
+
+	/*
+	 * Load the local map file into local_map, die on error.
+	 */
+	load_relmap_file(false);
+}
+
+/*
+ * load_relmap_file -- load shared_map or local_map from its map file
+ *
+ * Because the map file is essential for access to core system catalogs,
+ * failure to open, read, or validate it is reported as FATAL.
+ *
+ * Note that the local case requires DatabasePath to be set up.
+ */
+static void
+load_relmap_file(bool shared)
+{
+	RelMapFile *map;
+	char		mapfilename[MAXPGPATH];
+	pg_crc32	crc;
+	int			fd;
+
+	if (shared)
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
+				 RELMAPPER_FILENAME);
+		map = &shared_map;
+	}
+	else
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
+				 DatabasePath, RELMAPPER_FILENAME);
+		map = &local_map;
+	}
+
+	/* Read data ... directly into the static map variable chosen above */
+	fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	/*
+	 * Note: we could take RelationMappingLock in shared mode here, but it
+	 * seems unnecessary since our read() should be atomic against any
+	 * concurrent updater's write().  If the file is updated shortly after
+	 * we look, the sinval signaling mechanism will make us re-read it
+	 * before we are able to access any relation that's affected by the
+	 * change.  NOTE(review): a short read leaves errno unset; %m may mislead.
+	 */
+	if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not read relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	close(fd);
+
+	/* check for correct magic number and a sane entry count */
+	if (map->magic != RELMAPPER_FILEMAGIC ||
+		map->num_mappings < 0 ||
+		map->num_mappings > MAX_MAPPINGS)
+		ereport(FATAL,
+				(errmsg("relation mapping file \"%s\" contains invalid data",
+						mapfilename)));
+
+	/* verify the CRC, which covers every byte before the crc field */
+	INIT_CRC32(crc);
+	COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc));
+	FIN_CRC32(crc);
+
+	if (!EQ_CRC32(crc, map->crc))
+		ereport(FATAL,
+				(errmsg("relation mapping file \"%s\" contains incorrect checksum",
+						mapfilename)));
+}
+
+/*
+ * Write out a new shared or local map file with the given contents.
+ *
+ * The magic number and CRC are automatically updated in *newmap.  On
+ * success, *newmap is copied to the permanent map unless it already is it.
+ *
+ * If write_wal is TRUE then an appropriate WAL message is emitted.
+ * (It will be false for bootstrap and WAL replay cases.)
+ *
+ * If send_sinval is TRUE then a SI invalidation message is sent.
+ * (This should be true except in bootstrap case.)
+ *
+ * If preserve_files is TRUE then the storage manager is warned not to
+ * delete the files listed in the map.
+ *
+ * Because this may be called during WAL replay when MyDatabaseId,
+ * DatabasePath, etc aren't valid, we require the caller to pass in suitable
+ * values.  The caller is also responsible for being sure no concurrent
+ * map update could be happening.
+ */
+static void
+write_relmap_file(bool shared, RelMapFile *newmap,
+				  bool write_wal, bool send_sinval, bool preserve_files,
+				  Oid dbid, Oid tsid, const char *dbpath)
+{
+	int			fd;
+	RelMapFile *realmap;
+	char		mapfilename[MAXPGPATH];
+
+	/*
+	 * Fill in the overhead fields and update CRC.
+	 */
+	newmap->magic = RELMAPPER_FILEMAGIC;
+	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
+		elog(ERROR, "attempt to write bogus relation mapping");
+
+	INIT_CRC32(newmap->crc);
+	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
+	FIN_CRC32(newmap->crc);
+
+	/*
+	 * Open the target file.  We prefer to do this before entering the
+	 * critical section, so that an open() failure need not force PANIC.
+	 *
+	 * Note: since we use BasicOpenFile, we are nominally responsible for
+	 * ensuring the fd is closed on error.  In practice, this isn't important
+	 * because either an error happens inside the critical section, or we
+	 * are in bootstrap or WAL replay; so an error past this point is always
+	 * fatal anyway.
+	 */
+	if (shared)
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
+				 RELMAPPER_FILENAME);
+		realmap = &shared_map;
+	}
+	else
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
+				 dbpath, RELMAPPER_FILENAME);
+		realmap = &local_map;
+	}
+
+	fd = BasicOpenFile(mapfilename,
+					   O_WRONLY | O_CREAT | PG_BINARY,
+					   S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	if (write_wal)
+	{
+		xl_relmap_update xlrec;
+		XLogRecData rdata[2];
+		XLogRecPtr	lsn;
+
+		/* now errors are fatal ... */
+		START_CRIT_SECTION();
+
+		xlrec.dbid = dbid;
+		xlrec.tsid = tsid;
+		xlrec.nbytes = sizeof(RelMapFile);
+
+		rdata[0].data = (char *) (&xlrec);
+		rdata[0].len = MinSizeOfRelmapUpdate;
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].next = &(rdata[1]);
+		rdata[1].data = (char *) newmap;
+		rdata[1].len = sizeof(RelMapFile);
+		rdata[1].buffer = InvalidBuffer;
+		rdata[1].next = NULL;
+
+		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);
+
+		/* As always, WAL must hit the disk before the data update does */
+		XLogFlush(lsn);
+	}
+
+	errno = 0;
+	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to relation mapping file \"%s\": %m",
+						mapfilename)));
+	}
+
+	/*
+	 * We choose to fsync the data to disk before considering the task done.
+	 * It would be possible to relax this if it turns out to be a performance
+	 * issue, but it would complicate checkpointing --- see notes for
+	 * CheckPointRelationMap.
+	 */
+	if (pg_fsync(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	if (close(fd))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	/*
+	 * Now that the file is safely on disk, send sinval message to let other
+	 * backends know to re-read it.  We must do this inside the critical
+	 * section: if for some reason we fail to send the message, we have to
+	 * force a database-wide PANIC.  Otherwise other backends might continue
+	 * execution with stale mapping information, which would be catastrophic
+	 * as soon as others began to use the now-committed data.
+	 */
+	if (send_sinval)
+		CacheInvalidateRelmap(dbid);
+
+	/*
+	 * Make sure that the files listed in the map are not deleted if the
+	 * outer transaction aborts.  This had better be within the critical
+	 * section too: it's not likely to fail, but if it did, we'd arrive
+	 * at transaction abort with the files still vulnerable.  PANICing
+	 * will leave things in a good state on-disk.
+	 *
+	 * Note: we're cheating a little bit here by assuming that mapped files
+	 * are either in pg_global or the database's default tablespace.
+	 */
+	if (preserve_files)
+	{
+		int32		i;
+
+		for (i = 0; i < newmap->num_mappings; i++)
+		{
+			RelFileNode rnode;
+
+			rnode.spcNode = tsid;
+			rnode.dbNode = dbid;
+			rnode.relNode = newmap->mappings[i].mapfilenode;
+			RelationPreserveStorage(rnode);
+		}
+	}
+
+	if (realmap != newmap)		/* bootstrap passes the permanent copy itself */
+		memcpy(realmap, newmap, sizeof(RelMapFile));	/* success: update it */
+
+	/* Critical section done */
+	if (write_wal)
+		END_CRIT_SECTION();
+}
+
+/*
+ * Merge the specified updates into the appropriate "real" map,
+ * and write out the changes.  This function must be used for committing
+ * updates during normal multiuser operation.
+ */
+static void
+perform_relmap_update(bool shared, const RelMapFile *updates)
+{
+	RelMapFile	newmap;
+
+	/*
+	 * Anyone updating a relation's mapping info should take exclusive lock
+	 * on that rel and hold it until commit.  This ensures that there will
+	 * not be concurrent updates on the same mapping value; but there could
+	 * easily be concurrent updates on different values in the same file.
+	 * We cover that by acquiring the RelationMappingLock, re-reading the
+	 * target file to ensure it's up to date, applying the updates, and
+	 * writing the data before releasing RelationMappingLock.
+	 *
+	 * There is only one RelationMappingLock.  In principle we could try to
+	 * have one per mapping file, but it seems unlikely to be worth the
+	 * trouble.
+	 */
+	LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
+
+	/* Be certain we see any other updates just made */
+	load_relmap_file(shared);
+
+	/* Prepare updated data in a local variable */
+	if (shared)
+		memcpy(&newmap, &shared_map, sizeof(RelMapFile));
+	else
+		memcpy(&newmap, &local_map, sizeof(RelMapFile));
+
+	/* Apply the updates to newmap.  No new mappings should appear. */
+	merge_map_updates(&newmap, updates, false);
+
+	/* Write the map with WAL logging, sinval, and file preservation all on */
+	write_relmap_file(shared, &newmap, true, true, true,
+					  (shared ? InvalidOid : MyDatabaseId),
+					  (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
+					  DatabasePath);	/* only used for the local map */
+
+	/* Now we can release the lock */
+	LWLockRelease(RelationMappingLock);
+}
+
+/*
+ * RELMAP resource manager's routines: WAL replay and record description
+ */
+void
+relmap_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in relmap records */
+	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
+	if (info == XLOG_RELMAP_UPDATE)
+	{
+		xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
+		RelMapFile newmap;
+		char   *dbpath;
+
+		if (xlrec->nbytes != sizeof(RelMapFile))
+			elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
+				 xlrec->nbytes);
+		memcpy(&newmap, xlrec->data, sizeof(newmap));	/* work on a copy */
+
+		/* We need to construct the pathname for this database */
+		dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
+
+		/*
+		 * Write out the new map and send sinval, but of course don't
+		 * write a new WAL entry.  There's no surrounding transaction
+		 * to tell to preserve files, either.
+		 *
+		 * There shouldn't be anyone else updating relmaps during WAL replay,
+		 * so we don't bother to take the RelationMappingLock.  We would
+		 * need to do so if load_relmap_file needed to interlock against
+		 * writers.
+		 */
+		write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
+						  false, true, false,	/* no WAL, sinval, no preserve */
+						  xlrec->dbid, xlrec->tsid, dbpath);
+
+		pfree(dbpath);
+	}
+	else
+		elog(PANIC, "relmap_redo: unknown op code %u", info);
+}
+
+void
+relmap_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;	/* isolate the op code */
+
+	if (info == XLOG_RELMAP_UPDATE)
+	{
+		xl_relmap_update *xlrec = (xl_relmap_update *) rec;
+
+		appendStringInfo(buf, "update relmap: database %u tablespace %u size %u",
+						 xlrec->dbid, xlrec->tsid, xlrec->nbytes);
+	}
+	else
+		appendStringInfo(buf, "UNKNOWN");	/* op code this rmgr doesn't know */
+}
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 21664d8c7e..31bdc65ec1 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/init/miscinit.c,v 1.180 2010/01/02 16:57:56 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/init/miscinit.c,v 1.181 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -64,62 +64,6 @@ static char socketLockFile[MAXPGPATH];
 
 bool		IgnoreSystemIndexes = false;
 
-/* ----------------------------------------------------------------
- *		system index reindexing support
- *
- * When we are busy reindexing a system index, this code provides support
- * for preventing catalog lookups from using that index.
- * ----------------------------------------------------------------
- */
-
-static Oid	currentlyReindexedHeap = InvalidOid;
-static Oid	currentlyReindexedIndex = InvalidOid;
-
-/*
- * ReindexIsProcessingHeap
- *		True if heap specified by OID is currently being reindexed.
- */
-bool
-ReindexIsProcessingHeap(Oid heapOid)
-{
-	return heapOid == currentlyReindexedHeap;
-}
-
-/*
- * ReindexIsProcessingIndex
- *		True if index specified by OID is currently being reindexed.
- */
-bool
-ReindexIsProcessingIndex(Oid indexOid)
-{
-	return indexOid == currentlyReindexedIndex;
-}
-
-/*
- * SetReindexProcessing
- *		Set flag that specified heap/index are being reindexed.
- */
-void
-SetReindexProcessing(Oid heapOid, Oid indexOid)
-{
-	Assert(OidIsValid(heapOid) && OidIsValid(indexOid));
-	/* Reindexing is not re-entrant. */
-	if (OidIsValid(currentlyReindexedIndex))
-		elog(ERROR, "cannot reindex while reindexing");
-	currentlyReindexedHeap = heapOid;
-	currentlyReindexedIndex = indexOid;
-}
-
-/*
- * ResetReindexProcessing
- *		Unset reindexing status.
- */
-void
-ResetReindexProcessing(void)
-{
-	currentlyReindexedHeap = InvalidOid;
-	currentlyReindexedIndex = InvalidOid;
-}
 
 /* ----------------------------------------------------------------
  *				database path / name support stuff
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index ae04b6f287..538a518ef5 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -12,7 +12,7 @@
  *	by PostgreSQL
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/bin/pg_dump/pg_dump.c,v 1.569 2010/01/28 23:21:12 petere Exp $
+ *	  $PostgreSQL: pgsql/src/bin/pg_dump/pg_dump.c,v 1.570 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2300,6 +2300,12 @@ binary_upgrade_set_relfilenodes(PQExpBuffer upgrade_buffer, Oid pg_class_oid,
 	Oid			pg_class_reltoastrelid;
 	Oid			pg_class_reltoastidxid;
 
+	/*
+	 * Note: we don't need to use pg_relation_filenode() here because this
+	 * function is not intended to be used against system catalogs.
+	 * Otherwise we'd have to worry about which versions pg_relation_filenode
+	 * is available in.
+	 */
 	appendPQExpBuffer(upgrade_query,
 					  "SELECT c.relfilenode, c.reltoastrelid, t.reltoastidxid "
 					  "FROM pg_catalog.pg_class c LEFT JOIN "
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index 32b1bd535c..72ee757f70 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.20 2009/12/19 01:32:42 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.21 2010/02/07 20:48:11 tgl Exp $
  */
 #ifndef RMGR_H
 #define RMGR_H
@@ -23,6 +23,7 @@ typedef uint8 RmgrId;
 #define RM_DBASE_ID				4
 #define RM_TBLSPC_ID			5
 #define RM_MULTIXACT_ID			6
+#define RM_RELMAP_ID			7
 #define RM_STANDBY_ID			8
 #define RM_HEAP2_ID				9
 #define RM_HEAP_ID				10
diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h
index 236983fe07..b8401df772 100644
--- a/src/include/catalog/catalog.h
+++ b/src/include/catalog/catalog.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catalog.h,v 1.47 2010/01/12 02:42:52 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catalog.h,v 1.48 2010/02/07 20:48:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -45,7 +45,6 @@ extern bool IsSharedRelation(Oid relationId);
 extern Oid	GetNewOid(Relation relation);
 extern Oid GetNewOidWithIndex(Relation relation, Oid indexId,
 				   AttrNumber oidcolumn);
-extern Oid GetNewRelFileNode(Oid reltablespace, bool relisshared,
-				  Relation pg_class);
+extern Oid GetNewRelFileNode(Oid reltablespace, Relation pg_class);
 
 #endif   /* CATALOG_H */
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index e12ec58ed6..4a4ea6b492 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.582 2010/02/01 03:14:43 itagaki Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.583 2010/02/07 20:48:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201002011
+#define CATALOG_VERSION_NO	201002071
 
 #endif
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h
index 9c16737ada..d733dbb32e 100644
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.96 2010/01/28 23:21:12 petere Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.97 2010/02/07 20:48:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -41,6 +41,7 @@ extern Relation heap_create(const char *relname,
 			TupleDesc tupDesc,
 			char relkind,
 			bool shared_relation,
+			bool mapped_relation,
 			bool allow_system_table_mods);
 
 extern Oid heap_create_with_catalog(const char *relname,
@@ -54,6 +55,7 @@ extern Oid heap_create_with_catalog(const char *relname,
 						 List *cooked_constraints,
 						 char relkind,
 						 bool shared_relation,
+						 bool mapped_relation,
 						 bool oidislocal,
 						 int oidinhcount,
 						 OnCommitAction oncommit,
@@ -109,8 +111,10 @@ extern Form_pg_attribute SystemAttributeDefinition(AttrNumber attno,
 extern Form_pg_attribute SystemAttributeByName(const char *attname,
 					  bool relhasoids);
 
-extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind);
+extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind,
+									 bool allow_system_table_mods);
 
-extern void CheckAttributeType(const char *attname, Oid atttypid);
+extern void CheckAttributeType(const char *attname, Oid atttypid,
+							   bool allow_system_table_mods);
 
 #endif   /* HEAP_H */
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index bdb1c71a73..2bacf827c9 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/index.h,v 1.81 2010/02/03 01:14:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/index.h,v 1.82 2010/02/07 20:48:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -71,6 +71,9 @@ extern double IndexBuildHeapScan(Relation heapRelation,
 extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
 
 extern void reindex_index(Oid indexId);
-extern bool reindex_relation(Oid relid, bool toast_too);
+extern bool reindex_relation(Oid relid, bool toast_too, bool heap_rebuilt);
+
+extern bool ReindexIsProcessingHeap(Oid heapOid);
+extern bool ReindexIsProcessingIndex(Oid indexOid);
 
 #endif   /* INDEX_H */
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index aa35109edc..00d0dbc975 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_class.h,v 1.120 2010/01/28 23:21:12 petere Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_class.h,v 1.121 2010/02/07 20:48:11 tgl Exp $
  *
  * NOTES
  *	  the genbki.pl script reads this file and generates .bki
@@ -38,6 +38,7 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
 	Oid			relowner;		/* class owner */
 	Oid			relam;			/* index access method; 0 if not an index */
 	Oid			relfilenode;	/* identifier of physical storage file */
+	/* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
 	Oid			reltablespace;	/* identifier of table space for relation */
 	int4		relpages;		/* # of blocks (not always up-to-date) */
 	float4		reltuples;		/* # of tuples (not always up-to-date) */
@@ -128,13 +129,13 @@ typedef FormData_pg_class *Form_pg_class;
  */
 
 /* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
-DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 1247 0 0 0 0 0 f f f r 28 0 t f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1247 (  pg_type		PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f f r 28 0 t f f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 1249 0 0 0 0 0 f f f r 19 0 f f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1249 (  pg_attribute	PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f f r 19 0 f f f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 1255 0 0 0 0 0 f f f r 25 0 t f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1255 (  pg_proc		PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f f r 25 0 t f f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 1259 0 0 0 0 0 f f f r 27 0 t f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1259 (  pg_class		PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f f r 27 0 t f f f f f 3 _null_ _null_ ));
 DESCR("");
 
 #define		  RELKIND_INDEX			  'i'		/* secondary index */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index ec45367b4a..727b13e264 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.567 2010/02/01 03:14:44 itagaki Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.568 2010/02/07 20:48:11 tgl Exp $
  *
  * NOTES
  *	  The script catalog/genbki.pl reads this file and generates .bki
@@ -3741,6 +3741,10 @@ DATA(insert OID = 2997 ( pg_table_size			PGNSP PGUID 12 1 0 0 f f f t f v 1 0 20
 DESCR("disk space usage for the specified table, including TOAST, free space and visibility map");
 DATA(insert OID = 2998 ( pg_indexes_size		PGNSP PGUID 12 1 0 0 f f f t f v 1 0 20 "2205" _null_ _null_ _null_ _null_ pg_indexes_size _null_ _null_ _null_ ));
 DESCR("disk space usage for all indexes attached to the specified table");
+DATA(insert OID = 2999 ( pg_relation_filenode	PGNSP PGUID 12 1 0 0 f f f t f s 1 0 26 "2205" _null_ _null_ _null_ _null_ pg_relation_filenode _null_ _null_ _null_ ));
+DESCR("filenode identifier of relation");
+DATA(insert OID = 3034 ( pg_relation_filepath	PGNSP PGUID 12 1 0 0 f f f t f s 1 0 25 "2205" _null_ _null_ _null_ _null_ pg_relation_filepath _null_ _null_ _null_ ));
+DESCR("file path of relation");
 
 DATA(insert OID = 2316 ( postgresql_fdw_validator PGNSP PGUID 12 1 0 0 f f f t f i 2 0 16 "1009 26" _null_ _null_ _null_ _null_ postgresql_fdw_validator _null_ _null_ _null_));
 
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index e6cafd8216..f86cf9bbf5 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/storage.h,v 1.4 2010/01/02 16:58:02 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/storage.h,v 1.5 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,6 +22,7 @@
 
 extern void RelationCreateStorage(RelFileNode rnode, bool istemp);
 extern void RelationDropStorage(Relation rel);
+extern void RelationPreserveStorage(RelFileNode rnode);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
 
 /*
diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h
index f535781436..0fecd1986a 100644
--- a/src/include/commands/cluster.h
+++ b/src/include/commands/cluster.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994-5, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/cluster.h,v 1.39 2010/02/04 00:09:14 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/commands/cluster.h,v 1.40 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -25,9 +25,9 @@ extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid,
 extern void mark_index_clustered(Relation rel, Oid indexOid);
 
 extern Oid	make_new_heap(Oid OIDOldHeap, Oid NewTableSpace);
-extern void swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content,
-					TransactionId frozenXid);
-extern void cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
-							  bool swap_toast_by_content);
+extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
+							 bool is_system_catalog,
+							 bool swap_toast_by_content,
+							 TransactionId frozenXid);
 
 #endif   /* CLUSTER_H */
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index f47106f6cc..2face3a3bd 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/miscadmin.h,v 1.217 2010/01/02 16:58:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/miscadmin.h,v 1.218 2010/02/07 20:48:13 tgl Exp $
  *
  * NOTES
  *	  some of the information in this file should be moved to other files.
@@ -347,10 +347,6 @@ extern PGDLLIMPORT bool process_shared_preload_libraries_in_progress;
 extern char *shared_preload_libraries_string;
 extern char *local_preload_libraries_string;
 
-extern void SetReindexProcessing(Oid heapOid, Oid indexOid);
-extern void ResetReindexProcessing(void);
-extern bool ReindexIsProcessingHeap(Oid heapOid);
-extern bool ReindexIsProcessingIndex(Oid indexOid);
 extern void CreateDataDirLockFile(bool amPostmaster);
 extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster);
 extern void TouchSocketLockFile(void);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 74e98eca54..f0beb20a24 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.43 2010/01/02 16:58:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.44 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,6 +67,7 @@ typedef enum LWLockId
 	AutovacuumLock,
 	AutovacuumScheduleLock,
 	SyncScanLock,
+	RelationMappingLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h
index e2088270d0..b5e4e1134d 100644
--- a/src/include/storage/relfilenode.h
+++ b/src/include/storage/relfilenode.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.24 2010/01/02 16:58:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.25 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -61,6 +61,10 @@ typedef enum ForkNumber
  * identified by pg_database.dattablespace).  However this shorthand
  * is NOT allowed in RelFileNode structs --- the real tablespace ID
  * must be supplied when setting spcNode.
+ *
+ * Note: in pg_class, relfilenode can be zero to denote that the relation
+ * is a "mapped" relation, whose current true filenode number is available
+ * from relmapper.c.  Again, this case is NOT allowed in RelFileNodes.
  */
 typedef struct RelFileNode
 {
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 9f7bb2b2ee..bad8f50542 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/sinval.h,v 1.56 2010/01/09 16:49:27 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/storage/sinval.h,v 1.57 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,14 +19,16 @@
 
 
 /*
- * We currently support three types of shared-invalidation messages: one that
- * invalidates an entry in a catcache, one that invalidates a relcache entry,
- * and one that invalidates an smgr cache entry.  More types could be added
- * if needed.  The message type is identified by the first "int16" field of
- * the message struct.	Zero or positive means a catcache inval message (and
- * also serves as the catcache ID field).  -1 means a relcache inval message.
- * -2 means an smgr inval message.	Other negative values are available to
- * identify other inval message types.
+ * We support several types of shared-invalidation messages:
+ *	* invalidate a specific tuple in a specific catcache
+ *	* invalidate all catcache entries from a given system catalog
+ *	* invalidate a relcache entry for a specific logical relation
+ *	* invalidate an smgr cache entry for a specific physical relation
+ *	* invalidate the mapped-relation mapping for a given database
+ * More types could be added if needed.  The message type is identified by
+ * the first "int16" field of the message struct.  Zero or positive means a
+ * specific-catcache inval message (and also serves as the catcache ID field).
+ * Negative values identify the other message types, as per codes below.
  *
  * Catcache inval events are initially driven by detecting tuple inserts,
  * updates and deletions in system catalogs (see CacheInvalidateHeapTuple).
@@ -46,6 +48,16 @@
  * and so that negative cache entries can be recognized with good accuracy.
  * (Of course this assumes that all the backends are using identical hashing
  * code, but that should be OK.)
+ *
+ * Catcache and relcache invalidations are transactional, and so are sent
+ * to other backends upon commit.  Internally to the generating backend,
+ * they are also processed at CommandCounterIncrement so that later commands
+ * in the same transaction see the new state.  The generating backend also
+ * has to process them at abort, to flush out any cache state it's loaded
+ * from no-longer-valid entries.
+ *
+ * smgr and relation mapping invalidations are non-transactional: they are
+ * sent immediately when the underlying file change is made.
  */
 
 typedef struct
@@ -57,7 +69,16 @@ typedef struct
 	uint32		hashValue;		/* hash value of key for this catcache */
 } SharedInvalCatcacheMsg;
 
-#define SHAREDINVALRELCACHE_ID	(-1)
+#define SHAREDINVALCATALOG_ID	(-1)
+
+typedef struct
+{
+	int16		id;				/* type field --- must be first */
+	Oid			dbId;			/* database ID, or 0 if a shared catalog */
+	Oid			catId;			/* ID of catalog whose contents are invalid */
+} SharedInvalCatalogMsg;
+
+#define SHAREDINVALRELCACHE_ID	(-2)
 
 typedef struct
 {
@@ -66,7 +87,7 @@ typedef struct
 	Oid			relId;			/* relation ID */
 } SharedInvalRelcacheMsg;
 
-#define SHAREDINVALSMGR_ID		(-2)
+#define SHAREDINVALSMGR_ID		(-3)
 
 typedef struct
 {
@@ -74,12 +95,22 @@ typedef struct
 	RelFileNode rnode;			/* physical file ID */
 } SharedInvalSmgrMsg;
 
+#define SHAREDINVALRELMAP_ID	(-4)
+
+typedef struct
+{
+	int16		id;				/* type field --- must be first */
+	Oid			dbId;			/* database ID, or 0 for shared catalogs */
+} SharedInvalRelmapMsg;
+
 typedef union
 {
 	int16		id;				/* type field --- must be first */
 	SharedInvalCatcacheMsg cc;
+	SharedInvalCatalogMsg cat;
 	SharedInvalRelcacheMsg rc;
 	SharedInvalSmgrMsg sm;
+	SharedInvalRelmapMsg rm;
 } SharedInvalidationMessage;
 
 
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 6381c3d735..a6a4284b44 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.346 2010/02/01 03:14:45 itagaki Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.347 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -435,6 +435,8 @@ extern Datum pg_total_relation_size(PG_FUNCTION_ARGS);
 extern Datum pg_size_pretty(PG_FUNCTION_ARGS);
 extern Datum pg_table_size(PG_FUNCTION_ARGS);
 extern Datum pg_indexes_size(PG_FUNCTION_ARGS);
+extern Datum pg_relation_filenode(PG_FUNCTION_ARGS);
+extern Datum pg_relation_filepath(PG_FUNCTION_ARGS);
 
 /* genfile.c */
 extern Datum pg_stat_file(PG_FUNCTION_ARGS);
diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h
index b8e945e8cc..6d77c4a7d1 100644
--- a/src/include/utils/catcache.h
+++ b/src/include/utils/catcache.h
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.69 2010/01/02 16:58:10 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.70 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -180,6 +180,7 @@ extern void ReleaseCatCacheList(CatCList *list);
 
 extern void ResetCatalogCaches(void);
 extern void CatalogCacheFlushRelation(Oid relId);
+extern void CatalogCacheFlushCatalog(Oid catId);
 extern void CatalogCacheIdInvalidate(int cacheId, uint32 hashValue,
 						 ItemPointer pointer);
 extern void PrepareToInvalidateCacheTuple(Relation relation,
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index dc35160ffe..1a9bbe5b38 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.47 2010/02/03 01:14:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.48 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -45,6 +45,8 @@ extern void EndNonTransactionalInvalidation(void);
 
 extern void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple);
 
+extern void CacheInvalidateCatalog(Oid catalogId);
+
 extern void CacheInvalidateRelcache(Relation relation);
 
 extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple);
@@ -53,6 +55,8 @@ extern void CacheInvalidateRelcacheByRelid(Oid relid);
 
 extern void CacheInvalidateSmgr(RelFileNode rnode);
 
+extern void CacheInvalidateRelmap(Oid databaseId);
+
 extern void CacheRegisterSyscacheCallback(int cacheid,
 							  SyscacheCallbackFunction func,
 							  Datum arg);
@@ -60,6 +64,8 @@ extern void CacheRegisterSyscacheCallback(int cacheid,
 extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
 							  Datum arg);
 
+extern void CallSyscacheCallbacks(int cacheid, ItemPointer tuplePtr);
+
 extern void inval_twophase_postcommit(TransactionId xid, uint16 info,
 						  void *recdata, uint32 len);
 
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 3f5795d0ea..c4a1fcf7b6 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.121 2010/02/04 00:09:14 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.122 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -339,6 +339,16 @@ typedef struct StdRdOptions
 #define RelationGetNamespace(relation) \
 	((relation)->rd_rel->relnamespace)
 
+/*
+ * RelationIsMapped
+ *		True if the relation uses the relfilenode map.
+ *
+ * NB: this is only meaningful for relkinds that have storage, else it
+ * will misleadingly say "true".
+ */
+#define RelationIsMapped(relation) \
+	((relation)->rd_rel->relfilenode == InvalidOid)
+
 /*
  * RelationOpenSmgr
  *		Open the relation at the smgr level, if not already done.
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 2e48250cbf..74d6af01ba 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.67 2010/02/03 01:14:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.68 2010/02/07 20:48:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -68,7 +68,8 @@ extern Relation RelationBuildLocalRelation(const char *relname,
 						   TupleDesc tupDesc,
 						   Oid relid,
 						   Oid reltablespace,
-						   bool shared_relation);
+						   bool shared_relation,
+						   bool mapped_relation);
 
 /*
  * Routine to manage assignment of new relfilenode to a relation
@@ -85,6 +86,8 @@ extern void RelationCacheInvalidateEntry(Oid relationId);
 
 extern void RelationCacheInvalidate(void);
 
+extern void RelationCloseSmgrByOid(Oid relationId);
+
 extern void AtEOXact_RelationCache(bool isCommit);
 extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
 						  SubTransactionId parentSubid);
diff --git a/src/include/utils/relmapper.h b/src/include/utils/relmapper.h
new file mode 100644
index 0000000000..6bd1f6ba40
--- /dev/null
+++ b/src/include/utils/relmapper.h
@@ -0,0 +1,62 @@
+/*-------------------------------------------------------------------------
+ *
+ * relmapper.h
+ *	  Catalog-to-filenode mapping
+ *
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/include/utils/relmapper.h,v 1.1 2010/02/07 20:48:13 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELMAPPER_H
+#define RELMAPPER_H
+
+#include "access/xlog.h"
+
+/* ----------------
+ *		relmap-related XLOG entries
+ * ----------------
+ */
+
+#define XLOG_RELMAP_UPDATE		0x00
+
+typedef struct xl_relmap_update
+{
+	Oid			dbid;			/* database ID, or 0 for shared map */
+	Oid			tsid;			/* database's tablespace, or pg_global */
+	int32		nbytes;			/* size of relmap data */
+	char		data[1];		/* VARIABLE LENGTH ARRAY */
+} xl_relmap_update;
+
+#define MinSizeOfRelmapUpdate offsetof(xl_relmap_update, data)
+
+
+extern Oid	RelationMapOidToFilenode(Oid relationId, bool shared);
+
+extern void RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
+								 bool immediate);
+
+extern void RelationMapRemoveMapping(Oid relationId);
+
+extern void RelationMapInvalidate(bool shared);
+extern void RelationMapInvalidateAll(void);
+
+extern void AtCCI_RelationMap(void);
+extern void AtEOXact_RelationMap(bool isCommit);
+extern void AtPrepare_RelationMap(void);
+
+extern void CheckPointRelationMap(void);
+
+extern void RelationMapFinishBootstrap(void);
+
+extern void RelationMapInitialize(void);
+extern void RelationMapInitializePhase2(void);
+extern void RelationMapInitializePhase3(void);
+
+extern void relmap_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void relmap_desc(StringInfo buf, uint8 xl_info, char *rec);
+
+#endif   /* RELMAPPER_H */
diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out
index e94e1d538f..40db5df47e 100644
--- a/src/test/regress/expected/vacuum.out
+++ b/src/test/regress/expected/vacuum.out
@@ -108,7 +108,7 @@ SELECT relid,
  ORDER BY relid::text;
     relid    | cluster | full_inplace | full 
 -------------+---------+--------------+------
- pg_am       | t       | t            | t
+ pg_am       | t       | t            | f
  pg_class    | t       | t            | t
  pg_database | t       | t            | t
  vaccluster  | f       | t            | f
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index fa5f507e45..eb53eff4b4 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -1,5 +1,5 @@
 # ----------
-# $PostgreSQL: pgsql/src/test/regress/parallel_schedule,v 1.58 2010/01/28 23:21:13 petere Exp $
+# $PostgreSQL: pgsql/src/test/regress/parallel_schedule,v 1.59 2010/02/07 20:48:13 tgl Exp $
 #
 # By convention, we put no more than twenty tests in any one parallel group;
 # this limits the number of connections needed to run the tests.
@@ -52,7 +52,10 @@ test: copy copyselect
 # ----------
 # Another group of parallel tests
 # ----------
-test: constraints triggers create_misc create_aggregate create_operator inherit typed_table vacuum drop_if_exists create_cast
+test: constraints triggers create_misc create_aggregate create_operator inherit typed_table drop_if_exists create_cast
+
+# XXX temporarily run this by itself, to avoid deadlocks between VACUUM FULL on system catalogs and concurrent tests
+test: vacuum
 
 # Depends on the above
 test: create_index create_view
-- 
2.50.0