Improve vacuum code to track minimum Xids per table instead of per database.

[postgresql] / src / backend / utils / init / flatfiles.c
diff --git a/src/backend/utils/init/flatfiles.c b/src/backend/utils/init/flatfiles.c

index e58c1102c253e4104908e0f238510a416a4e7444..4669fb9e9d4b96d862d0cae50873be7dc22c2099 100644 (file)
--- a/src/backend/utils/init/flatfiles.c
+++ b/src/backend/utils/init/flatfiles.c
@@ -4,9 +4,10 @@
   *       Routines for maintaining "flat file" images of the shared catalogs.
   *
   * We use flat files so that the postmaster and not-yet-fully-started
- * backends can look at the contents of pg_database, pg_shadow, and pg_group
- * for authentication purposes.  This module is responsible for keeping the
- * flat-file images as nearly in sync with database reality as possible.
+ * backends can look at the contents of pg_database, pg_authid, and
+ * pg_auth_members for authentication purposes.  This module is
+ * responsible for keeping the flat-file images as nearly in sync with
+ * database reality as possible.
   *
   * The tricky part of the write_xxx_file() routines in this module is that
   * they need to be able to operate in the context of the database startup
@@ -19,10 +20,10 @@
   * a way that this is OK.
   *
   *
- * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.6 2005/04/14 20:03:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.19 2006/07/10 16:20:51 alvherre Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -32,10 +33,11 @@
  #include <unistd.h>
  
  #include "access/heapam.h"
+#include "access/twophase_rmgr.h"
+#include "catalog/pg_auth_members.h"
+#include "catalog/pg_authid.h"
  #include "catalog/pg_database.h"
-#include "catalog/pg_group.h"
  #include "catalog/pg_namespace.h"
-#include "catalog/pg_shadow.h"
  #include "catalog/pg_tablespace.h"
  #include "commands/trigger.h"
  #include "miscadmin.h"
@@ -48,9 +50,13 @@
  #include "utils/syscache.h"
  
  
-#define DATABASE_FLAT_FILE     "pg_database"
-#define GROUP_FLAT_FILE                "pg_group"
-#define USER_FLAT_FILE         "pg_pwd"
+/* Actual names of the flat files (within $PGDATA) */
+#define DATABASE_FLAT_FILE     "global/pg_database"
+#define AUTH_FLAT_FILE         "global/pg_auth"
+
+/* Info bits in a flatfiles 2PC record */
+#define FF_BIT_DATABASE 1
+#define FF_BIT_AUTH            2
  
  
  /*
@@ -66,8 +72,7 @@
   * SubTransactionId is seen at top-level commit.
   */
  static SubTransactionId database_file_update_subid = InvalidSubTransactionId;
-static SubTransactionId group_file_update_subid = InvalidSubTransactionId;
-static SubTransactionId user_file_update_subid = InvalidSubTransactionId;
+static SubTransactionId auth_file_update_subid = InvalidSubTransactionId;
  
  
  /*
@@ -81,81 +86,41 @@ database_file_update_needed(void)
  }
  
  /*
- * Mark flat group file as needing an update (because pg_group changed)
- */
-void
-group_file_update_needed(void)
-{
-       if (group_file_update_subid == InvalidSubTransactionId)
-               group_file_update_subid = GetCurrentSubTransactionId();
-}
-
-/*
- * Mark flat user file as needing an update (because pg_shadow changed)
+ * Mark flat auth file as needing an update (because pg_authid or
+ * pg_auth_members changed)
   */
  void
-user_file_update_needed(void)
+auth_file_update_needed(void)
  {
-       if (user_file_update_subid == InvalidSubTransactionId)
-               user_file_update_subid = GetCurrentSubTransactionId();
+       if (auth_file_update_subid == InvalidSubTransactionId)
+               auth_file_update_subid = GetCurrentSubTransactionId();
  }
  
  
  /*
- * database_getflatfilename --- get full pathname of database file
+ * database_getflatfilename --- get pathname of database file
   *
   * Note that result string is palloc'd, and should be freed by the caller.
+ * (This convention is not really needed anymore, since the relative path
+ * is fixed.)
   */
  char *
  database_getflatfilename(void)
  {
-       int                     bufsize;
-       char       *pfnam;
-
-       bufsize = strlen(DataDir) + strlen("/global/") +
-               strlen(DATABASE_FLAT_FILE) + 1;
-       pfnam = (char *) palloc(bufsize);
-       snprintf(pfnam, bufsize, "%s/global/%s", DataDir, DATABASE_FLAT_FILE);
-
-       return pfnam;
+       return pstrdup(DATABASE_FLAT_FILE);
  }
  
  /*
- * group_getflatfilename --- get full pathname of group file
+ * auth_getflatfilename --- get pathname of auth file
   *
   * Note that result string is palloc'd, and should be freed by the caller.
+ * (This convention is not really needed anymore, since the relative path
+ * is fixed.)
   */
  char *
-group_getflatfilename(void)
+auth_getflatfilename(void)
  {
-       int                     bufsize;
-       char       *pfnam;
-
-       bufsize = strlen(DataDir) + strlen("/global/") +
-               strlen(GROUP_FLAT_FILE) + 1;
-       pfnam = (char *) palloc(bufsize);
-       snprintf(pfnam, bufsize, "%s/global/%s", DataDir, GROUP_FLAT_FILE);
-
-       return pfnam;
-}
-
-/*
- * Get full pathname of password file.
- *
- * Note that result string is palloc'd, and should be freed by the caller.
- */
-char *
-user_getflatfilename(void)
-{
-       int                     bufsize;
-       char       *pfnam;
-
-       bufsize = strlen(DataDir) + strlen("/global/") +
-               strlen(USER_FLAT_FILE) + 1;
-       pfnam = (char *) palloc(bufsize);
-       snprintf(pfnam, bufsize, "%s/global/%s", DataDir, USER_FLAT_FILE);
-
-       return pfnam;
+       return pstrdup(AUTH_FLAT_FILE);
  }
  
  
@@ -182,7 +147,7 @@ fputs_quote(const char *str, FILE *fp)
  /*
   * name_okay
   *
- * We must disallow newlines in user and group names because
+ * We must disallow newlines in role names because
   * hba.c's parser won't handle fields split across lines, even if quoted.
   */
  static bool
@@ -198,7 +163,7 @@ name_okay(const char *str)
  /*
   * write_database_file: update the flat database file
   *
- * A side effect is to determine the oldest database's datfrozenxid
+ * A side effect is to determine the oldest database's datminxid
   * so we can set or update the XID wrap limit.
   */
  static void
@@ -212,12 +177,12 @@ write_database_file(Relation drel)
         HeapScanDesc scan;
         HeapTuple       tuple;
         NameData        oldest_datname;
-       TransactionId oldest_datfrozenxid = InvalidTransactionId;
+       TransactionId oldest_datminxid = InvalidTransactionId;
  
         /*
          * Create a temporary filename to be renamed later.  This prevents the
-        * backend from clobbering the flat file while the postmaster
-        * might be reading from it.
+        * backend from clobbering the flat file while the postmaster might be
+        * reading from it.
          */
         filename = database_getflatfilename();
         bufsize = strlen(filename) + 12;
@@ -243,25 +208,27 @@ write_database_file(Relation drel)
                 char       *datname;
                 Oid                     datoid;
                 Oid                     dattablespace;
-               TransactionId datfrozenxid;
+               TransactionId datminxid,
+                                       datvacuumxid;
  
                 datname = NameStr(dbform->datname);
                 datoid = HeapTupleGetOid(tuple);
                 dattablespace = dbform->dattablespace;
-               datfrozenxid = dbform->datfrozenxid;
+               datminxid = dbform->datminxid;
+               datvacuumxid = dbform->datvacuumxid;
  
                 /*
-                * Identify the oldest datfrozenxid, ignoring databases that are not
-                * connectable (we assume they are safely frozen).  This must match
+                * Identify the oldest datminxid, ignoring databases that are not
+                * connectable (we assume they are safely frozen).  This must match
                  * the logic in vac_truncate_clog() in vacuum.c.
                  */
                 if (dbform->datallowconn &&
-                       TransactionIdIsNormal(datfrozenxid))
+                       TransactionIdIsNormal(datminxid))
                 {
-                       if (oldest_datfrozenxid == InvalidTransactionId ||
-                               TransactionIdPrecedes(datfrozenxid, oldest_datfrozenxid))
+                       if (oldest_datminxid == InvalidTransactionId ||
+                               TransactionIdPrecedes(datminxid, oldest_datminxid))
                         {
-                               oldest_datfrozenxid = datfrozenxid;
+                               oldest_datminxid = datminxid;
                                 namestrcpy(&oldest_datname, datname);
                         }
                 }
@@ -277,13 +244,14 @@ write_database_file(Relation drel)
                 }
  
                 /*
-                * The file format is: "dbname" oid tablespace frozenxid
+                * The file format is: "dbname" oid tablespace minxid vacuumxid
                  *
-                * The xid is not needed for backend startup, but may be of use
-                * for forensic purposes.
+                * The xids are not needed for backend startup, but are of use to
+                * autovacuum, and might also be helpful for forensic purposes.
                  */
                 fputs_quote(datname, fp);
-               fprintf(fp, " %u %u %u\n", datoid, dattablespace, datfrozenxid);
+               fprintf(fp, " %u %u %u %u\n",
+                               datoid, dattablespace, datminxid, datvacuumxid);
         }
         heap_endscan(scan);
  
@@ -294,8 +262,8 @@ write_database_file(Relation drel)
                                                 tempname)));
  
         /*
-        * Rename the temp file to its final name, deleting the old flat file.
-        * We expect that rename(2) is an atomic action.
+        * Rename the temp file to its final name, deleting the old flat file. We
+        * expect that rename(2) is an atomic action.
          */
         if (rename(tempname, filename))
                 ereport(ERROR,
@@ -303,177 +271,114 @@ write_database_file(Relation drel)
                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
                                                 tempname, filename)));
  
-       pfree(tempname);
-       pfree(filename);
-
         /*
-        * Set the transaction ID wrap limit using the oldest datfrozenxid
+        * Set the transaction ID wrap limit using the oldest datminxid
          */
-       if (oldest_datfrozenxid != InvalidTransactionId)
-               SetTransactionIdLimit(oldest_datfrozenxid, &oldest_datname);
+       if (oldest_datminxid != InvalidTransactionId)
+               SetTransactionIdLimit(oldest_datminxid, &oldest_datname);
  }
  
  
  /*
- * write_group_file: update the flat group file
+ * Support for write_auth_file
+ *
+ * The format for the flat auth file is
+ *             "rolename" "password" "validuntil" "memberof" "memberof" ...
+ * Only roles that are marked rolcanlogin are entered into the auth file.
+ * Each role's line lists all the roles (groups) of which it is directly
+ * or indirectly a member, except for itself.
+ *
+ * The postmaster expects the file to be sorted by rolename.  There is not
+ * any special ordering of the membership lists.
+ *
+ * To construct this information, we scan pg_authid and pg_auth_members,
+ * and build data structures in-memory before writing the file.
   */
-static void
-write_group_file(Relation grel)
-{
-       char       *filename,
-                          *tempname;
-       int                     bufsize;
-       FILE       *fp;
-       mode_t          oumask;
-       HeapScanDesc scan;
-       HeapTuple       tuple;
-
-       /*
-        * Create a temporary filename to be renamed later.  This prevents the
-        * backend from clobbering the flat file while the postmaster
-        * might be reading from it.
-        */
-       filename = group_getflatfilename();
-       bufsize = strlen(filename) + 12;
-       tempname = (char *) palloc(bufsize);
-       snprintf(tempname, bufsize, "%s.%d", filename, MyProcPid);
-
-       oumask = umask((mode_t) 077);
-       fp = AllocateFile(tempname, "w");
-       umask(oumask);
-       if (fp == NULL)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not write to temporary file \"%s\": %m",
-                                               tempname)));
  
-       /*
-        * Read pg_group and write the file.
-        */
-       scan = heap_beginscan(grel, SnapshotNow, 0, NULL);
-       while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
-       {
-               Form_pg_group grpform = (Form_pg_group) GETSTRUCT(tuple);
-               HeapTupleHeader tup = tuple->t_data;
-               char       *tp;                         /* ptr to tuple data */
-               long            off;                    /* offset in tuple data */
-               bits8      *bp = tup->t_bits;   /* ptr to null bitmask in tuple */
-               Datum           datum;
-               char       *groname;
-               IdList     *grolist_p;
-               AclId      *aidp;
-               int                     i,
-                                       num;
-
-               groname = NameStr(grpform->groname);
-
-               /*
-                * Check for illegal characters in the group name.
-                */
-               if (!name_okay(groname))
-               {
-                       ereport(LOG,
-                                       (errmsg("invalid group name \"%s\"", groname)));
-                       continue;
-               }
-
-               /*
-                * We can't use heap_getattr() here because during startup we will
-                * not have any tupdesc for pg_group.  Fortunately it's not too
-                * hard to work around this.  grolist is the first possibly-null
-                * field so we can compute its offset directly.
-                */
-               tp = (char *) tup + tup->t_hoff;
-               off = offsetof(FormData_pg_group, grolist);
-
-               if (HeapTupleHasNulls(tuple) &&
-                       att_isnull(Anum_pg_group_grolist - 1, bp))
-               {
-                       /* grolist is null, so we can ignore this group */
-                       continue;
-               }
-
-               /* assume grolist is pass-by-ref */
-               datum = PointerGetDatum(tp + off);
-
-               /*
-                * We can't currently support out-of-line toasted group lists in
-                * startup mode (the tuptoaster won't work).  This sucks, but it
-                * should be something of a corner case.  Live with it until we
-                * can redesign pg_group.
-                *
-                * Detect startup mode by noting whether we got a tupdesc.
-                */
-               if (VARATT_IS_EXTERNAL(DatumGetPointer(datum)) &&
-                       RelationGetDescr(grel) == NULL)
-                       continue;
+typedef struct
+{
+       Oid                     roleid;
+       bool            rolcanlogin;
+       char       *rolname;
+       char       *rolpassword;
+       char       *rolvaliduntil;
+       List       *member_of;
+} auth_entry;
+
+typedef struct
+{
+       Oid                     roleid;
+       Oid                     memberid;
+} authmem_entry;
  
-               /* be sure the IdList is not toasted */
-               grolist_p = DatumGetIdListP(datum);
  
-               /*
-                * The file format is: "groupname"    usesysid1 usesysid2 ...
-                *
-                * We ignore groups that have no members.
-                */
-               aidp = IDLIST_DAT(grolist_p);
-               num = IDLIST_NUM(grolist_p);
-               if (num > 0)
-               {
-                       fputs_quote(groname, fp);
-                       fprintf(fp, "\t%u", aidp[0]);
-                       for (i = 1; i < num; ++i)
-                               fprintf(fp, " %u", aidp[i]);
-                       fputs("\n", fp);
-               }
-
-               /* if IdList was toasted, free detoasted copy */
-               if ((Pointer) grolist_p != DatumGetPointer(datum))
-                       pfree(grolist_p);
-       }
-       heap_endscan(scan);
+/* qsort comparator for sorting auth_entry array by roleid */
+static int
+oid_compar(const void *a, const void *b)
+{
+       const auth_entry *a_auth = (const auth_entry *) a;
+       const auth_entry *b_auth = (const auth_entry *) b;
+
+       if (a_auth->roleid < b_auth->roleid)
+               return -1;
+       if (a_auth->roleid > b_auth->roleid)
+               return 1;
+       return 0;
+}
  
-       if (FreeFile(fp))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not write to temporary file \"%s\": %m",
-                                               tempname)));
+/* qsort comparator for sorting auth_entry array by rolname */
+static int
+name_compar(const void *a, const void *b)
+{
+       const auth_entry *a_auth = (const auth_entry *) a;
+       const auth_entry *b_auth = (const auth_entry *) b;
  
-       /*
-        * Rename the temp file to its final name, deleting the old flat file.
-        * We expect that rename(2) is an atomic action.
-        */
-       if (rename(tempname, filename))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not rename file \"%s\" to \"%s\": %m",
-                                               tempname, filename)));
+       return strcmp(a_auth->rolname, b_auth->rolname);
+}
  
-       pfree(tempname);
-       pfree(filename);
+/* qsort comparator for sorting authmem_entry array by memberid */
+static int
+mem_compar(const void *a, const void *b)
+{
+       const authmem_entry *a_auth = (const authmem_entry *) a;
+       const authmem_entry *b_auth = (const authmem_entry *) b;
+
+       if (a_auth->memberid < b_auth->memberid)
+               return -1;
+       if (a_auth->memberid > b_auth->memberid)
+               return 1;
+       return 0;
  }
  
  
  /*
- * write_user_file: update the flat password file
+ * write_auth_file: update the flat auth file
   */
  static void
-write_user_file(Relation urel)
+write_auth_file(Relation rel_authid, Relation rel_authmem)
  {
         char       *filename,
                            *tempname;
         int                     bufsize;
+       BlockNumber totalblocks;
         FILE       *fp;
         mode_t          oumask;
         HeapScanDesc scan;
         HeapTuple       tuple;
+       int                     curr_role = 0;
+       int                     total_roles = 0;
+       int                     curr_mem = 0;
+       int                     total_mem = 0;
+       int                     est_rows;
+       auth_entry *auth_info;
+       authmem_entry *authmem_info;
  
         /*
          * Create a temporary filename to be renamed later.  This prevents the
-        * backend from clobbering the flat file while the postmaster might
-        * be reading from it.
+        * backend from clobbering the flat file while the postmaster might be
+        * reading from it.
          */
-       filename = user_getflatfilename();
+       filename = auth_getflatfilename();
         bufsize = strlen(filename) + 12;
         tempname = (char *) palloc(bufsize);
         snprintf(tempname, bufsize, "%s.%d", filename, MyProcPid);
@@ -488,39 +393,50 @@ write_user_file(Relation urel)
                                                 tempname)));
  
         /*
-        * Read pg_shadow and write the file.
+        * Read pg_authid and fill temporary data structures.  Note we must read
+        * all roles, even those without rolcanlogin.
          */
-       scan = heap_beginscan(urel, SnapshotNow, 0, NULL);
+       totalblocks = RelationGetNumberOfBlocks(rel_authid);
+       totalblocks = totalblocks ? totalblocks : 1;
+       est_rows = totalblocks * (BLCKSZ / (sizeof(HeapTupleHeaderData) + sizeof(FormData_pg_authid)));
+       auth_info = (auth_entry *) palloc(est_rows * sizeof(auth_entry));
+
+       scan = heap_beginscan(rel_authid, SnapshotNow, 0, NULL);
         while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
         {
-               Form_pg_shadow pwform = (Form_pg_shadow) GETSTRUCT(tuple);
+               Form_pg_authid aform = (Form_pg_authid) GETSTRUCT(tuple);
                 HeapTupleHeader tup = tuple->t_data;
-               char       *tp;                         /* ptr to tuple data */
-               long            off;                    /* offset in tuple data */
+               char       *tp;                 /* ptr to tuple data */
+               long            off;            /* offset in tuple data */
                 bits8      *bp = tup->t_bits;   /* ptr to null bitmask in tuple */
                 Datum           datum;
-               char       *usename,
-                                  *passwd,
-                                  *valuntil;
-               AclId           usesysid;
  
-               usename = NameStr(pwform->usename);
-               usesysid = pwform->usesysid;
+               if (curr_role >= est_rows)
+               {
+                       est_rows *= 2;
+                       auth_info = (auth_entry *)
+                               repalloc(auth_info, est_rows * sizeof(auth_entry));
+               }
+
+               auth_info[curr_role].roleid = HeapTupleGetOid(tuple);
+               auth_info[curr_role].rolcanlogin = aform->rolcanlogin;
+               auth_info[curr_role].rolname = pstrdup(NameStr(aform->rolname));
+               auth_info[curr_role].member_of = NIL;
  
                 /*
-                * We can't use heap_getattr() here because during startup we will
-                * not have any tupdesc for pg_shadow.  Fortunately it's not too
-                * hard to work around this.  passwd is the first possibly-null
-                * field so we can compute its offset directly.
+                * We can't use heap_getattr() here because during startup we will not
+                * have any tupdesc for pg_authid.  Fortunately it's not too hard to
+                * work around this.  rolpassword is the first possibly-null field so
+                * we can compute its offset directly.
                  */
                 tp = (char *) tup + tup->t_hoff;
-               off = offsetof(FormData_pg_shadow, passwd);
+               off = offsetof(FormData_pg_authid, rolpassword);
  
                 if (HeapTupleHasNulls(tuple) &&
-                       att_isnull(Anum_pg_shadow_passwd - 1, bp))
+                       att_isnull(Anum_pg_authid_rolpassword - 1, bp))
                 {
                         /* passwd is null, emit as an empty string */
-                       passwd = pstrdup("");
+                       auth_info[curr_role].rolpassword = pstrdup("");
                 }
                 else
                 {
@@ -528,63 +444,203 @@ write_user_file(Relation urel)
                         datum = PointerGetDatum(tp + off);
  
                         /*
-                        * The password probably shouldn't ever be out-of-line toasted;
-                        * if it is, ignore it, since we can't handle that in startup mode.
+                        * The password probably shouldn't ever be out-of-line toasted; if
+                        * it is, ignore it, since we can't handle that in startup mode.
                          */
                         if (VARATT_IS_EXTERNAL(DatumGetPointer(datum)))
-                               passwd = pstrdup("");
+                               auth_info[curr_role].rolpassword = pstrdup("");
                         else
-                               passwd = DatumGetCString(DirectFunctionCall1(textout, datum));
+                               auth_info[curr_role].rolpassword = DatumGetCString(DirectFunctionCall1(textout, datum));
  
                         /* assume passwd has attlen -1 */
                         off = att_addlength(off, -1, tp + off);
                 }
  
                 if (HeapTupleHasNulls(tuple) &&
-                       att_isnull(Anum_pg_shadow_valuntil - 1, bp))
+                       att_isnull(Anum_pg_authid_rolvaliduntil - 1, bp))
                 {
-                       /* valuntil is null, emit as an empty string */
-                       valuntil = pstrdup("");
+                       /* rolvaliduntil is null, emit as an empty string */
+                       auth_info[curr_role].rolvaliduntil = pstrdup("");
                 }
                 else
                 {
-                       /* assume valuntil has attalign 'i' */
-                       off = att_align(off, 'i');
-                       /* assume valuntil is pass-by-value, integer size */
-                       datum = Int32GetDatum(*((int32 *) (tp + off)));
-                       valuntil = DatumGetCString(DirectFunctionCall1(abstimeout, datum));
+                       /*
+                        * rolvaliduntil is timestamptz, which we assume is double
+                        * alignment and pass-by-reference.
+                        */
+                       off = att_align(off, 'd');
+                       datum = PointerGetDatum(tp + off);
+                       auth_info[curr_role].rolvaliduntil = DatumGetCString(DirectFunctionCall1(timestamptz_out, datum));
                 }
  
                 /*
                  * Check for illegal characters in the user name and password.
                  */
-               if (!name_okay(usename))
+               if (!name_okay(auth_info[curr_role].rolname))
                 {
                         ereport(LOG,
-                                       (errmsg("invalid user name \"%s\"", usename)));
+                                       (errmsg("invalid role name \"%s\"",
+                                                       auth_info[curr_role].rolname)));
                         continue;
                 }
-               if (!name_okay(passwd))
+               if (!name_okay(auth_info[curr_role].rolpassword))
                 {
                         ereport(LOG,
-                                       (errmsg("invalid user password \"%s\"", passwd)));
+                                       (errmsg("invalid role password \"%s\"",
+                                                       auth_info[curr_role].rolpassword)));
                         continue;
                 }
  
+               curr_role++;
+               total_roles++;
+       }
+       heap_endscan(scan);
+
+       /*
+        * Read pg_auth_members into temporary data structure, too
+        */
+       totalblocks = RelationGetNumberOfBlocks(rel_authmem);
+       totalblocks = totalblocks ? totalblocks : 1;
+       est_rows = totalblocks * (BLCKSZ / (sizeof(HeapTupleHeaderData) + sizeof(FormData_pg_auth_members)));
+       authmem_info = (authmem_entry *) palloc(est_rows * sizeof(authmem_entry));
+
+       scan = heap_beginscan(rel_authmem, SnapshotNow, 0, NULL);
+       while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+       {
+               Form_pg_auth_members memform = (Form_pg_auth_members) GETSTRUCT(tuple);
+
+               if (curr_mem >= est_rows)
+               {
+                       est_rows *= 2;
+                       authmem_info = (authmem_entry *)
+                               repalloc(authmem_info, est_rows * sizeof(authmem_entry));
+               }
+
+               authmem_info[curr_mem].roleid = memform->roleid;
+               authmem_info[curr_mem].memberid = memform->member;
+               curr_mem++;
+               total_mem++;
+       }
+       heap_endscan(scan);
+
+       /*
+        * Search for memberships.  We can skip all this if pg_auth_members is
+        * empty.
+        */
+       if (total_mem > 0)
+       {
                 /*
-                * The file format is: "usename" usesysid "passwd" "valuntil"
+                * Sort auth_info by roleid and authmem_info by memberid.
                  */
-               fputs_quote(usename, fp);
-               fprintf(fp, " %u ", usesysid);
-               fputs_quote(passwd, fp);
-               fputs(" ", fp);
-               fputs_quote(valuntil, fp);
-               fputs("\n", fp);
-
-               pfree(passwd);
-               pfree(valuntil);
+               qsort(auth_info, total_roles, sizeof(auth_entry), oid_compar);
+               qsort(authmem_info, total_mem, sizeof(authmem_entry), mem_compar);
+
+               /*
+                * For each role, find what it belongs to.
+                */
+               for (curr_role = 0; curr_role < total_roles; curr_role++)
+               {
+                       List       *roles_list;
+                       List       *roles_names_list = NIL;
+                       ListCell   *mem;
+
+                       /* We can skip this for non-login roles */
+                       if (!auth_info[curr_role].rolcanlogin)
+                               continue;
+
+                       /*
+                        * This search algorithm is the same as in is_member_of_role; we
+                        * are just working with a different input data structure.
+                        */
+                       roles_list = list_make1_oid(auth_info[curr_role].roleid);
+
+                       foreach(mem, roles_list)
+                       {
+                               authmem_entry key;
+                               authmem_entry *found_mem;
+                               int                     first_found,
+                                                       last_found,
+                                                       i;
+
+                               key.memberid = lfirst_oid(mem);
+                               found_mem = bsearch(&key, authmem_info, total_mem,
+                                                                       sizeof(authmem_entry), mem_compar);
+                               if (!found_mem)
+                                       continue;
+
+                               /*
+                                * bsearch found a match for us; but if there were multiple
+                                * matches it could have found any one of them. Locate first
+                                * and last match.
+                                */
+                               first_found = last_found = (found_mem - authmem_info);
+                               while (first_found > 0 &&
+                                          mem_compar(&key, &authmem_info[first_found - 1]) == 0)
+                                       first_found--;
+                               while (last_found + 1 < total_mem &&
+                                          mem_compar(&key, &authmem_info[last_found + 1]) == 0)
+                                       last_found++;
+
+                               /*
+                                * Now add all the new roles to roles_list.
+                                */
+                               for (i = first_found; i <= last_found; i++)
+                                       roles_list = list_append_unique_oid(roles_list,
+                                                                                                        authmem_info[i].roleid);
+                       }
+
+                       /*
+                        * Convert list of role Oids to list of role names. We must do
+                        * this before re-sorting auth_info.
+                        *
+                        * We skip the first list element (curr_role itself) since there
+                        * is no point in writing that a role is a member of itself.
+                        */
+                       for_each_cell(mem, lnext(list_head(roles_list)))
+                       {
+                               auth_entry      key_auth;
+                               auth_entry *found_role;
+
+                               key_auth.roleid = lfirst_oid(mem);
+                               found_role = bsearch(&key_auth, auth_info, total_roles,
+                                                                        sizeof(auth_entry), oid_compar);
+                               if (found_role) /* paranoia */
+                                       roles_names_list = lappend(roles_names_list,
+                                                                                          found_role->rolname);
+                       }
+                       auth_info[curr_role].member_of = roles_names_list;
+                       list_free(roles_list);
+               }
+       }
+
+       /*
+        * Now sort auth_info into rolname order for output, and write the file.
+        */
+       qsort(auth_info, total_roles, sizeof(auth_entry), name_compar);
+
+       for (curr_role = 0; curr_role < total_roles; curr_role++)
+       {
+               auth_entry *arole = &auth_info[curr_role];
+
+               if (arole->rolcanlogin)
+               {
+                       ListCell   *mem;
+
+                       fputs_quote(arole->rolname, fp);
+                       fputs(" ", fp);
+                       fputs_quote(arole->rolpassword, fp);
+                       fputs(" ", fp);
+                       fputs_quote(arole->rolvaliduntil, fp);
+
+                       foreach(mem, arole->member_of)
+                       {
+                               fputs(" ", fp);
+                               fputs_quote((char *) lfirst(mem), fp);
+                       }
+
+                       fputs("\n", fp);
+               }
         }
-       heap_endscan(scan);
  
         if (FreeFile(fp))
                 ereport(ERROR,
@@ -593,17 +649,14 @@ write_user_file(Relation urel)
                                                 tempname)));
  
         /*
-        * Rename the temp file to its final name, deleting the old flat file.
-        * We expect that rename(2) is an atomic action.
+        * Rename the temp file to its final name, deleting the old flat file. We
+        * expect that rename(2) is an atomic action.
          */
         if (rename(tempname, filename))
                 ereport(ERROR,
                                 (errcode_for_file_access(),
                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
                                                 tempname, filename)));
-
-       pfree(tempname);
-       pfree(filename);
  }
  
  
@@ -615,23 +668,27 @@ write_user_file(Relation urel)
   * base backup which may be far out of sync with the current state.
   *
   * In theory we could skip rebuilding the flat files if no WAL replay
- * occurred, but it seems safest to just do it always.  We have to
- * scan pg_database to compute the XID wrap limit anyway.
+ * occurred, but it seems best to just do it always.  We have to
+ * scan pg_database to compute the XID wrap limit anyway.  Also, this
+ * policy means we need not force initdb to change the format of the
+ * flat files.
   *
   * In a standalone backend we pass database_only = true to skip processing
- * the user and group files.  We won't need them, and building them could
- * fail if there's something corrupt in those catalogs.
+ * the auth file.  We won't need it, and building it could fail if there's
+ * something corrupt in the authid/authmem catalogs.
   */
  void
  BuildFlatFiles(bool database_only)
  {
         ResourceOwner owner;
         RelFileNode rnode;
-       Relation        rel;
+       Relation        rel_db,
+                               rel_authid,
+                               rel_authmem;
  
         /*
-        * We don't have any hope of running a real relcache, but we can use
-        * the same fake-relcache facility that WAL replay uses.
+        * We don't have any hope of running a real relcache, but we can use the
+        * same fake-relcache facility that WAL replay uses.
          */
         XLogInitRelationCache();
  
@@ -645,26 +702,24 @@ BuildFlatFiles(bool database_only)
         rnode.relNode = DatabaseRelationId;
  
         /* No locking is needed because no one else is alive yet */
-       rel = XLogOpenRelation(true, 0, rnode);
-       write_database_file(rel);
+       rel_db = XLogOpenRelation(rnode);
+       write_database_file(rel_db);
  
         if (!database_only)
         {
-               /* hard-wired path to pg_group */
+               /* hard-wired path to pg_authid */
                 rnode.spcNode = GLOBALTABLESPACE_OID;
                 rnode.dbNode = 0;
-               rnode.relNode = GroupRelationId;
+               rnode.relNode = AuthIdRelationId;
+               rel_authid = XLogOpenRelation(rnode);
  
-               rel = XLogOpenRelation(true, 0, rnode);
-               write_group_file(rel);
-
-               /* hard-wired path to pg_shadow */
+               /* hard-wired path to pg_auth_members */
                 rnode.spcNode = GLOBALTABLESPACE_OID;
                 rnode.dbNode = 0;
-               rnode.relNode = ShadowRelationId;
+               rnode.relNode = AuthMemRelationId;
+               rel_authmem = XLogOpenRelation(rnode);
  
-               rel = XLogOpenRelation(true, 0, rnode);
-               write_user_file(rel);
+               write_auth_file(rel_authid, rel_authmem);
         }
  
         CurrentResourceOwner = NULL;
@@ -692,42 +747,69 @@ void
  AtEOXact_UpdateFlatFiles(bool isCommit)
  {
         Relation        drel = NULL;
-       Relation        grel = NULL;
-       Relation        urel = NULL;
+       Relation        arel = NULL;
+       Relation        mrel = NULL;
  
         if (database_file_update_subid == InvalidSubTransactionId &&
-               group_file_update_subid == InvalidSubTransactionId &&
-               user_file_update_subid == InvalidSubTransactionId)
+               auth_file_update_subid == InvalidSubTransactionId)
                 return;                                 /* nothing to do */
  
         if (!isCommit)
         {
                 database_file_update_subid = InvalidSubTransactionId;
-               group_file_update_subid = InvalidSubTransactionId;
-               user_file_update_subid = InvalidSubTransactionId;
+               auth_file_update_subid = InvalidSubTransactionId;
                 return;
         }
  
         /*
-        * Advance command counter to be certain we see all effects of the
-        * current transaction.
+        * Advance command counter to be certain we see all effects of the current
+        * transaction.
          */
         CommandCounterIncrement();
  
         /*
-        * We use ExclusiveLock to ensure that only one backend writes the
-        * flat file(s) at a time.      That's sufficient because it's okay to
-        * allow plain reads of the tables in parallel.  There is some chance
-        * of a deadlock here (if we were triggered by a user update of one
-        * of the tables, which likely won't have gotten a strong enough lock),
-        * so get the locks we need before writing anything.
+        * Open and lock the needed catalog(s).
+        *
+        * Even though we only need AccessShareLock, this could theoretically fail
+        * due to deadlock.  In practice, however, our transaction already holds
+        * RowExclusiveLock or better (it couldn't have updated the catalog
+        * without such a lock).  This implies that dbcommands.c and other places
+        * that force flat-file updates must not follow the common practice of
+        * dropping catalog locks before commit.
          */
         if (database_file_update_subid != InvalidSubTransactionId)
-               drel = heap_open(DatabaseRelationId, ExclusiveLock);
-       if (group_file_update_subid != InvalidSubTransactionId)
-               grel = heap_open(GroupRelationId, ExclusiveLock);
-       if (user_file_update_subid != InvalidSubTransactionId)
-               urel = heap_open(ShadowRelationId, ExclusiveLock);
+               drel = heap_open(DatabaseRelationId, AccessShareLock);
+
+       if (auth_file_update_subid != InvalidSubTransactionId)
+       {
+               arel = heap_open(AuthIdRelationId, AccessShareLock);
+               mrel = heap_open(AuthMemRelationId, AccessShareLock);
+       }
+
+       /*
+        * Obtain special locks to ensure that two transactions don't try to write
+        * the same flat file concurrently.  Quite aside from any direct risks of
+        * corrupted output, the winning writer probably wouldn't have seen the
+        * other writer's updates.  By taking a lock and holding it till commit,
+        * we ensure that whichever updater goes second will see the other
+        * updater's changes as committed, and thus the final state of the file
+        * will include all updates.
+        *
+        * We use a lock on "database 0" to protect writing the pg_database flat
+        * file, and a lock on "role 0" to protect the auth file.  This is a bit
+        * ugly but it's not worth inventing any more-general convention.  (Any
+        * two locktags that are never used for anything else would do.)
+        *
+        * This is safe against deadlock as long as these are the very last locks
+        * acquired during the transaction.
+        */
+       if (database_file_update_subid != InvalidSubTransactionId)
+               LockSharedObject(DatabaseRelationId, InvalidOid, 0,
+                                                AccessExclusiveLock);
+
+       if (auth_file_update_subid != InvalidSubTransactionId)
+               LockSharedObject(AuthIdRelationId, InvalidOid, 0,
+                                                AccessExclusiveLock);
  
         /* Okay to write the files */
         if (database_file_update_subid != InvalidSubTransactionId)
@@ -737,18 +819,12 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
                 heap_close(drel, NoLock);
         }
  
-       if (group_file_update_subid != InvalidSubTransactionId)
+       if (auth_file_update_subid != InvalidSubTransactionId)
         {
-               group_file_update_subid = InvalidSubTransactionId;
-               write_group_file(grel);
-               heap_close(grel, NoLock);
-       }
-
-       if (user_file_update_subid != InvalidSubTransactionId)
-       {
-               user_file_update_subid = InvalidSubTransactionId;
-               write_user_file(urel);
-               heap_close(urel, NoLock);
+               auth_file_update_subid = InvalidSubTransactionId;
+               write_auth_file(arel, mrel);
+               heap_close(arel, NoLock);
+               heap_close(mrel, NoLock);
         }
  
         /*
@@ -757,6 +833,38 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
         SendPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE);
  }
  
+
+/*
+ * This routine is called during transaction prepare.
+ *
+ * Record which files need to be refreshed if this transaction later
+ * commits.
+ *
+ * Note: it's OK to clear the flags immediately, since if the PREPARE fails
+ * further on, we'd only reset the flags anyway. So there's no need for a
+ * separate PostPrepare call.
+ */
+void
+AtPrepare_UpdateFlatFiles(void)
+{
+       uint16          info = 0;       /* OR of FF_BIT_* flags for pending files */
+
+       if (database_file_update_subid != InvalidSubTransactionId)
+       {
+               database_file_update_subid = InvalidSubTransactionId;
+               info |= FF_BIT_DATABASE;        /* database file update pending */
+       }
+       if (auth_file_update_subid != InvalidSubTransactionId)
+       {
+               auth_file_update_subid = InvalidSubTransactionId;
+               info |= FF_BIT_AUTH;    /* auth file update pending */
+       }
+       if (info != 0)                  /* only register when a file is pending */
+               RegisterTwoPhaseRecord(TWOPHASE_RM_FLATFILES_ID, info,
+                                                          NULL, 0);
+}
+
+
  /*
   * AtEOSubXact_UpdateFlatFiles
   *
@@ -773,32 +881,26 @@ AtEOSubXact_UpdateFlatFiles(bool isCommit,
                 if (database_file_update_subid == mySubid)
                         database_file_update_subid = parentSubid;
  
-               if (group_file_update_subid == mySubid)
-                       group_file_update_subid = parentSubid;
-
-               if (user_file_update_subid == mySubid)
-                       user_file_update_subid = parentSubid;
+               if (auth_file_update_subid == mySubid)
+                       auth_file_update_subid = parentSubid;
         }
         else
         {
                 if (database_file_update_subid == mySubid)
                         database_file_update_subid = InvalidSubTransactionId;
  
-               if (group_file_update_subid == mySubid)
-                       group_file_update_subid = InvalidSubTransactionId;
-
-               if (user_file_update_subid == mySubid)
-                       user_file_update_subid = InvalidSubTransactionId;
+               if (auth_file_update_subid == mySubid)
+                       auth_file_update_subid = InvalidSubTransactionId;
         }
  }
  
  
  /*
- * This trigger is fired whenever someone modifies pg_database, pg_shadow
- * or pg_group via general-purpose INSERT/UPDATE/DELETE commands.
+ * This trigger is fired whenever someone modifies pg_database, pg_authid
+ * or pg_auth_members via general-purpose INSERT/UPDATE/DELETE commands.
   *
   * It is sufficient for this to be a STATEMENT trigger since we don't
- * care which individual rows changed.  It doesn't much matter whether
+ * care which individual rows changed. It doesn't much matter whether
   * it's a BEFORE or AFTER trigger.
   */
  Datum
@@ -818,11 +920,9 @@ flatfile_update_trigger(PG_FUNCTION_ARGS)
                 case DatabaseRelationId:
                         database_file_update_needed();
                         break;
-               case GroupRelationId:
-                       group_file_update_needed();
-                       break;
-               case ShadowRelationId:
-                       user_file_update_needed();
+               case AuthIdRelationId:
+               case AuthMemRelationId:
+                       auth_file_update_needed();
                         break;
                 default:
                         elog(ERROR, "flatfile_update_trigger was called for wrong table");
@@ -831,3 +931,26 @@ flatfile_update_trigger(PG_FUNCTION_ARGS)
  
         return PointerGetDatum(NULL);
  }
+
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * (We don't have to do anything for ROLLBACK PREPARED.)
+ */
+void
+flatfile_twophase_postcommit(TransactionId xid, uint16 info,
+                                                        void *recdata, uint32 len)
+{
+       /*
+        * Set flags to do the needed file updates at the end of my own current
+        * transaction; xid, recdata and len are not used here.  (XXX this has
+        * some issues if my own transaction later rolls back, or if there is
+        * any significant delay before I commit.  OK for now because we
+        * disallow COMMIT PREPARED inside a transaction block.)
+        */
+       if (info & FF_BIT_DATABASE)     /* bit set by AtPrepare_UpdateFlatFiles */
+               database_file_update_needed();
+       if (info & FF_BIT_AUTH)         /* bit set by AtPrepare_UpdateFlatFiles */
+               auth_file_update_needed();
+}