Fix up pgstats counting of live and dead tuples to recognize that committed

[postgresql] / src / backend / postmaster / pgstat.c
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c

index 5616d0b3cd1911b3f65e5d98c97d98218ecc230f..b41a16de44ce86435068597a40e0fa3537ccd08b 100644 (file)
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -11,9 +11,9 @@
   *                     - Add a pgstat config column to pg_database, so this
   *                       entire thing can be enabled/disabled on a per db basis.
   *
- *     Copyright (c) 2001-2003, PostgreSQL Global Development Group
+ *     Copyright (c) 2001-2007, PostgreSQL Global Development Group
   *
- *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.76 2004/06/26 16:32:02 tgl Exp $
+ *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
   * ----------
   */
  #include "postgres.h"
@@ -28,69 +28,64 @@
  #include <arpa/inet.h>
  #include <signal.h>
  #include <time.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#ifdef HAVE_SYS_POLL_H
+#include <sys/poll.h>
+#endif
  
  #include "pgstat.h"
  
  #include "access/heapam.h"
+#include "access/transam.h"
+#include "access/twophase_rmgr.h"
  #include "access/xact.h"
-#include "catalog/catname.h"
  #include "catalog/pg_database.h"
-#include "catalog/pg_shadow.h"
+#include "libpq/ip.h"
  #include "libpq/libpq.h"
  #include "libpq/pqsignal.h"
  #include "mb/pg_wchar.h"
  #include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/fork_process.h"
  #include "postmaster/postmaster.h"
  #include "storage/backendid.h"
+#include "storage/fd.h"
  #include "storage/ipc.h"
  #include "storage/pg_shmem.h"
  #include "storage/pmsignal.h"
-#include "tcop/tcopprot.h"
-#include "utils/hsearch.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
-#include "utils/rel.h"
-#include "utils/syscache.h"
  
  
  /* ----------
- * Paths for the statistics files. The %s is replaced with the
- * installation's $PGDATA.
+ * Paths for the statistics files (relative to installation's $PGDATA).
   * ----------
   */
-#define PGSTAT_STAT_FILENAME   "%s/global/pgstat.stat"
-#define PGSTAT_STAT_TMPFILE            "%s/global/pgstat.tmp.%d"
+#define PGSTAT_STAT_FILENAME   "global/pgstat.stat"
+#define PGSTAT_STAT_TMPFILE            "global/pgstat.tmp"
  
  /* ----------
   * Timer definitions.
   * ----------
   */
-#define PGSTAT_STAT_INTERVAL   500             /* How often to write the status
-                                                                                * file; in milliseconds. */
-
-#define PGSTAT_DESTROY_DELAY   10000   /* How long to keep destroyed
-                                                                                * objects known, to give delayed
-                                                                                * UDP packets time to arrive;
+#define PGSTAT_STAT_INTERVAL   500             /* How often to write the status file;
                                                                                  * in milliseconds. */
  
-#define PGSTAT_DESTROY_COUNT   (PGSTAT_DESTROY_DELAY / PGSTAT_STAT_INTERVAL)
+#define PGSTAT_RESTART_INTERVAL 60             /* How often to attempt to restart a
+                                                                                * failed statistics collector; in
+                                                                                * seconds. */
  
-#define PGSTAT_RESTART_INTERVAL 60             /* How often to attempt to restart
-                                                                                * a failed statistics collector;
-                                                                                * in seconds. */
+#define PGSTAT_SELECT_TIMEOUT  2               /* How often to check for postmaster
+                                                                                * death; in seconds. */
  
-/* ----------
- * Amount of space reserved in pgstat_recvbuffer().
- * ----------
- */
-#define PGSTAT_RECVBUFFERSZ            ((int) (1024 * sizeof(PgStat_Msg)))
  
  /* ----------
   * The initial size hints for the hash tables used in the collector.
   * ----------
   */
  #define PGSTAT_DB_HASH_SIZE            16
-#define PGSTAT_BE_HASH_SIZE            512
  #define PGSTAT_TAB_HASH_SIZE   512
  
  
@@ -99,42 +94,97 @@
   * ----------
   */
  bool           pgstat_collect_startcollector = true;
-bool           pgstat_collect_resetonpmstart = true;
-bool           pgstat_collect_querystring = false;
+bool           pgstat_collect_resetonpmstart = false;
  bool           pgstat_collect_tuplelevel = false;
  bool           pgstat_collect_blocklevel = false;
+bool           pgstat_collect_querystring = false;
+
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
  
  /* ----------
   * Local data
   * ----------
   */
-NON_EXEC_STATIC int    pgStatSock = -1;
-static int     pgStatPipe[2];
+NON_EXEC_STATIC int pgStatSock = -1;
+
  static struct sockaddr_storage pgStatAddr;
  
  static time_t last_pgstat_start_time;
  
-static long pgStatNumMessages = 0;
+static bool pgStatRunningInCollector = false;
+
+/*
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
+ */
+#define TABSTAT_QUANTUM                100                     /* we alloc this many at a time */
+
+typedef struct TabStatusArray
+{
+       struct TabStatusArray *tsa_next;        /* link to next array, if any */
+       int                     tsa_used;                               /* # entries currently used */
+       PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
+} TabStatusArray;
  
-static bool pgStatRunningInCollector = FALSE;
+static TabStatusArray *pgStatTabList = NULL;
  
-static int     pgStatTabstatAlloc = 0;
-static int     pgStatTabstatUsed = 0;
-static PgStat_MsgTabstat **pgStatTabstatMessages = NULL;
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+       int                     nest_level;                             /* subtransaction nest level */
+       struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
+       PgStat_TableXactStatus *first;          /* head of list for this subxact */
+} PgStat_SubXactStatus;
  
-#define TABSTAT_QUANTUM                4       /* we alloc this many at a time */
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
  
  static int     pgStatXactCommit = 0;
  static int     pgStatXactRollback = 0;
  
-static TransactionId pgStatDBHashXact = InvalidTransactionId;
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+       PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+       PgStat_Counter tuples_deleted;  /* tuples deleted in xact */
+       Oid                     t_id;                           /* table's OID */
+       bool            t_shared;                       /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
+static MemoryContext pgStatLocalContext = NULL;
  static HTAB *pgStatDBHash = NULL;
-static HTAB *pgStatBeDead = NULL;
-static PgStat_StatBeEntry *pgStatBeTable = NULL;
-static int     pgStatNumBackends = 0;
+static PgBackendStatus *localBackendStatusTable = NULL;
+static int     localNumBackends = 0;
+
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
  
-static char pgStat_fname[MAXPGPATH];
-static char pgStat_tmpfname[MAXPGPATH];
+static volatile bool need_exit = false;
+static volatile bool need_statwrite = false;
  
  
  /* ----------
@@ -142,42 +192,38 @@ static char pgStat_tmpfname[MAXPGPATH];
   * ----------
   */
  #ifdef EXEC_BACKEND
-
-typedef enum STATS_PROCESS_TYPE
-{
-       STAT_PROC_BUFFER,
-       STAT_PROC_COLLECTOR
-} STATS_PROCESS_TYPE;
-
-static pid_t pgstat_forkexec(STATS_PROCESS_TYPE procType);
-static void pgstat_parseArgs(int argc, char *argv[]);
-
+static pid_t pgstat_forkexec(void);
  #endif
  
-NON_EXEC_STATIC void PgstatBufferMain(int argc, char *argv[]);
  NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
-static void pgstat_recvbuffer(void);
  static void pgstat_exit(SIGNAL_ARGS);
-static void pgstat_die(SIGNAL_ARGS);
+static void force_statwrite(SIGNAL_ARGS);
+static void pgstat_beshutdown_hook(int code, Datum arg);
  
-static int     pgstat_add_backend(PgStat_MsgHdr *msg);
-static void pgstat_sub_backend(int procpid);
-static void pgstat_drop_database(Oid databaseid);
+static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
  static void pgstat_write_statsfile(void);
-static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-                                         PgStat_StatBeEntry **betab,
-                                         int *numbackends);
+static HTAB *pgstat_read_statsfile(Oid onlydb);
+static void backend_read_statsfile(void);
+static void pgstat_read_current_status(void);
  
-static void pgstat_setheader(PgStat_MsgHdr *hdr, int mtype);
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
+static HTAB *pgstat_collect_oids(Oid catalogid);
+
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
+static void pgstat_setup_memcxt(void);
+
+static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
  static void pgstat_send(void *msg, int len);
  
-static void pgstat_recv_bestart(PgStat_MsgBestart *msg, int len);
-static void pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len);
-static void pgstat_recv_activity(PgStat_MsgActivity *msg, int len);
  static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
  static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
  static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
  static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
+static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
+static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
+static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
  
  
  /* ------------------------------------------------------------
@@ -202,35 +248,29 @@ pgstat_init(void)
                            *addr,
                                 hints;
         int                     ret;
-       fd_set      rset;
+       fd_set          rset;
         struct timeval tv;
-       char        test_byte;
-       int         sel_res;
+       char            test_byte;
+       int                     sel_res;
+       int                     tries = 0;
  
  #define TESTBYTEVAL ((char) 199)
  
         /*
-        * Force start of collector daemon if something to collect
+        * Force start of collector daemon if something to collect.  Note that
+        * pgstat_collect_querystring is now an independent facility that does not
+        * require the collector daemon.
          */
-       if (pgstat_collect_querystring ||
-               pgstat_collect_tuplelevel ||
+       if (pgstat_collect_tuplelevel ||
                 pgstat_collect_blocklevel)
                 pgstat_collect_startcollector = true;
  
-       /*
-        * Initialize the filename for the status reports.  (In the EXEC_BACKEND
-        * case, this only sets the value in the postmaster.  The collector
-        * subprocess will recompute the value for itself, and individual
-        * backends must do so also if they want to access the file.)
-        */
-       snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);
-
         /*
          * If we don't have to start a collector or should reset the collected
-        * statistics on postmaster start, simply remove the file.
+        * statistics on postmaster start, simply remove the stats file.
          */
         if (!pgstat_collect_startcollector || pgstat_collect_resetonpmstart)
-               unlink(pgStat_fname);
+               pgstat_reset_all();
  
         /*
          * Nothing else required if collector will not get started
@@ -249,7 +289,7 @@ pgstat_init(void)
         hints.ai_addr = NULL;
         hints.ai_canonname = NULL;
         hints.ai_next = NULL;
-       ret = getaddrinfo_all("localhost", NULL, &hints, &addrs);
+       ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
         if (ret || !addrs)
         {
                 ereport(LOG,
@@ -259,11 +299,11 @@ pgstat_init(void)
         }
  
         /*
-        * On some platforms, getaddrinfo_all() may return multiple addresses
+        * On some platforms, pg_getaddrinfo_all() may return multiple addresses
          * only one of which will actually work (eg, both IPv6 and IPv4 addresses
          * when kernel will reject IPv6).  Worse, the failure may occur at the
-        * bind() or perhaps even connect() stage.  So we must loop through the
-        * results till we find a working combination.  We will generate LOG
+        * bind() or perhaps even connect() stage.      So we must loop through the
+        * results till we find a working combination. We will generate LOG
          * messages, but no error, for bogus combinations.
          */
         for (addr = addrs; addr; addr = addr->ai_next)
@@ -273,6 +313,11 @@ pgstat_init(void)
                 if (addr->ai_family == AF_UNIX)
                         continue;
  #endif
+
+               if (++tries > 1)
+                       ereport(LOG,
+                       (errmsg("trying another address for the statistics collector")));
+
                 /*
                  * Create the socket.
                  */
@@ -280,7 +325,7 @@ pgstat_init(void)
                 {
                         ereport(LOG,
                                         (errcode_for_socket_access(),
-                                        errmsg("could not create socket for statistics collector: %m")));
+                       errmsg("could not create socket for statistics collector: %m")));
                         continue;
                 }
  
@@ -292,14 +337,14 @@ pgstat_init(void)
                 {
                         ereport(LOG,
                                         (errcode_for_socket_access(),
-                                        errmsg("could not bind socket for statistics collector: %m")));
+                         errmsg("could not bind socket for statistics collector: %m")));
                         closesocket(pgStatSock);
                         pgStatSock = -1;
                         continue;
                 }
  
                 alen = sizeof(pgStatAddr);
-               if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
+               if (getsockname(pgStatSock, (struct sockaddr *) & pgStatAddr, &alen) < 0)
                 {
                         ereport(LOG,
                                         (errcode_for_socket_access(),
@@ -315,25 +360,29 @@ pgstat_init(void)
                  * provides a kernel-level check that only packets from this same
                  * address will be received.
                  */
-               if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
+               if (connect(pgStatSock, (struct sockaddr *) & pgStatAddr, alen) < 0)
                 {
                         ereport(LOG,
                                         (errcode_for_socket_access(),
-                                        errmsg("could not connect socket for statistics collector: %m")));
+                       errmsg("could not connect socket for statistics collector: %m")));
                         closesocket(pgStatSock);
                         pgStatSock = -1;
                         continue;
                 }
  
                 /*
-                * Try to send and receive a one-byte test message on the socket.
-                * This is to catch situations where the socket can be created but
-                * will not actually pass data (for instance, because kernel packet
-                * filtering rules prevent it).
+                * Try to send and receive a one-byte test message on the socket. This
+                * is to catch situations where the socket can be created but will not
+                * actually pass data (for instance, because kernel packet filtering
+                * rules prevent it).
                  */
                 test_byte = TESTBYTEVAL;
+
+retry1:
                 if (send(pgStatSock, &test_byte, 1, 0) != 1)
                 {
+                       if (errno == EINTR)
+                               goto retry1;    /* if interrupted, just retry */
                         ereport(LOG,
                                         (errcode_for_socket_access(),
                                          errmsg("could not send test message on socket for statistics collector: %m")));
@@ -353,7 +402,7 @@ pgstat_init(void)
                         FD_SET(pgStatSock, &rset);
                         tv.tv_sec = 0;
                         tv.tv_usec = 500000;
-                       sel_res = select(pgStatSock+1, &rset, NULL, NULL, &tv);
+                       sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
                         if (sel_res >= 0 || errno != EINTR)
                                 break;
                 }
@@ -375,7 +424,7 @@ pgstat_init(void)
                          * errno will not be set meaningfully here, so don't use it.
                          */
                         ereport(LOG,
-                                       (ERRCODE_CONNECTION_FAILURE,
+                                       (errcode(ERRCODE_CONNECTION_FAILURE),
                                          errmsg("test message did not get through on socket for statistics collector")));
                         closesocket(pgStatSock);
                         pgStatSock = -1;
@@ -384,8 +433,11 @@ pgstat_init(void)
  
                 test_byte++;                    /* just make sure variable is changed */
  
+retry2:
                 if (recv(pgStatSock, &test_byte, 1, 0) != 1)
                 {
+                       if (errno == EINTR)
+                               goto retry2;    /* if interrupted, just retry */
                         ereport(LOG,
                                         (errcode_for_socket_access(),
                                          errmsg("could not receive test message on socket for statistics collector: %m")));
@@ -394,10 +446,10 @@ pgstat_init(void)
                         continue;
                 }
  
-               if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
+               if (test_byte != TESTBYTEVAL)   /* strictly paranoia ... */
                 {
                         ereport(LOG,
-                                       (ERRCODE_INTERNAL_ERROR,
+                                       (errcode(ERRCODE_INTERNAL_ERROR),
                                          errmsg("incorrect test message transmission on socket for statistics collector")));
                         closesocket(pgStatSock);
                         pgStatSock = -1;
@@ -410,34 +462,31 @@ pgstat_init(void)
  
         /* Did we find a working address? */
         if (!addr || pgStatSock < 0)
-       {
-               ereport(LOG,
-                               (errcode_for_socket_access(),
-                                errmsg("disabling statistics collector for lack of working socket")));
                 goto startup_failed;
-       }
  
         /*
-        * Set the socket to non-blocking IO.  This ensures that if the
-        * collector falls behind (despite the buffering process), statistics
-        * messages will be discarded; backends won't block waiting to send
-        * messages to the collector.
+        * Set the socket to non-blocking IO.  This ensures that if the collector
+        * falls behind, statistics messages will be discarded; backends won't
+        * block waiting to send messages to the collector.
          */
-       if (!set_noblock(pgStatSock))
+       if (!pg_set_noblock(pgStatSock))
         {
                 ereport(LOG,
                                 (errcode_for_socket_access(),
-               errmsg("could not set statistics collector socket to nonblocking mode: %m")));
+                                errmsg("could not set statistics collector socket to nonblocking mode: %m")));
                 goto startup_failed;
         }
  
-       freeaddrinfo_all(hints.ai_family, addrs);
+       pg_freeaddrinfo_all(hints.ai_family, addrs);
  
         return;
  
  startup_failed:
+       ereport(LOG,
+         (errmsg("disabling statistics collector for lack of working socket")));
+
         if (addrs)
-               freeaddrinfo_all(hints.ai_family, addrs);
+               pg_freeaddrinfo_all(hints.ai_family, addrs);
  
         if (pgStatSock >= 0)
                 closesocket(pgStatSock);
@@ -445,82 +494,46 @@ startup_failed:
  
         /* Adjust GUC variables to suppress useless activity */
         pgstat_collect_startcollector = false;
-       pgstat_collect_querystring = false;
         pgstat_collect_tuplelevel = false;
         pgstat_collect_blocklevel = false;
  }
  
+/*
+ * pgstat_reset_all() -
+ *
+ * Remove the stats file.  This is used on server start if the
+ * stats_reset_on_server_start feature is enabled, or if WAL
+ * recovery is needed after a crash.
+ */
+void
+pgstat_reset_all(void)
+{
+       unlink(PGSTAT_STAT_FILENAME);
+}
  
  #ifdef EXEC_BACKEND
  
  /*
   * pgstat_forkexec() -
   *
- * Format up the arglist for, then fork and exec, statistics
- * (buffer and collector) processes
+ * Format up the arglist for, then fork and exec, statistics collector process
   */
  static pid_t
-pgstat_forkexec(STATS_PROCESS_TYPE procType)
+pgstat_forkexec(void)
  {
-       char *av[10];
-       int ac = 0, bufc = 0, i;
-       char pgstatBuf[2][32];
+       char       *av[10];
+       int                     ac = 0;
  
         av[ac++] = "postgres";
-
-       switch (procType)
-       {
-               case STAT_PROC_BUFFER:
-                       av[ac++] = "-forkbuf";
-                       break;
-
-               case STAT_PROC_COLLECTOR:
-                       av[ac++] = "-forkcol";
-                       break;
-
-               default:
-                       Assert(false);
-       }
-
+       av[ac++] = "--forkcol";
         av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
  
-       /* postgres_exec_path is not passed by write_backend_variables */
-       av[ac++] = postgres_exec_path;
-
-       /* Pipe file ids (those not passed by write_backend_variables) */
-       snprintf(pgstatBuf[bufc++],32,"%d",pgStatPipe[0]);
-       snprintf(pgstatBuf[bufc++],32,"%d",pgStatPipe[1]);
-
-       /* Add to the arg list */
-       Assert(bufc <= lengthof(pgstatBuf));
-       for (i = 0; i < bufc; i++)
-               av[ac++] = pgstatBuf[i];
-
         av[ac] = NULL;
         Assert(ac < lengthof(av));
  
         return postmaster_forkexec(ac, av);
  }
-
-
-/*
- * pgstat_parseArgs() -
- *
- * Extract data from the arglist for exec'ed statistics
- * (buffer and collector) processes
- */
-static void
-pgstat_parseArgs(int argc, char *argv[])
-{
-       Assert(argc == 6);
-
-       argc = 3;
-       StrNCpy(postgres_exec_path,     argv[argc++], MAXPGPATH);
-       pgStatPipe[0]   = atoi(argv[argc++]);
-       pgStatPipe[1]   = atoi(argv[argc++]);
-}
-
-#endif /* EXEC_BACKEND */
+#endif   /* EXEC_BACKEND */
  
  
  /* ----------
@@ -547,11 +560,10 @@ pgstat_start(void)
                 return 0;
  
         /*
-        * Do nothing if too soon since last collector start.  This is a
-        * safety valve to protect against continuous respawn attempts if the
-        * collector is dying immediately at launch.  Note that since we will
-        * be re-called from the postmaster main loop, we will get another
-        * chance later.
+        * Do nothing if too soon since last collector start.  This is a safety
+        * valve to protect against continuous respawn attempts if the collector
+        * is dying immediately at launch.      Note that since we will be re-called
+        * from the postmaster main loop, we will get another chance later.
          */
         curtime = time(NULL);
         if ((unsigned int) (curtime - last_pgstat_start_time) <
@@ -578,44 +590,30 @@ pgstat_start(void)
         /*
          * Okay, fork off the collector.
          */
-
-       fflush(stdout);
-       fflush(stderr);
-
-#ifdef __BEOS__
-       /* Specific beos actions before backend startup */
-       beos_before_backend_startup();
-#endif
-
  #ifdef EXEC_BACKEND
-       switch ((pgStatPid = pgstat_forkexec(STAT_PROC_BUFFER)))
+       switch ((pgStatPid = pgstat_forkexec()))
  #else
-       switch ((pgStatPid = fork()))
+       switch ((pgStatPid = fork_process()))
  #endif
         {
                 case -1:
-#ifdef __BEOS__
-                       /* Specific beos actions */
-                       beos_backend_startup_failed();
-#endif
                         ereport(LOG,
-                                       (errmsg("could not fork statistics buffer: %m")));
+                                       (errmsg("could not fork statistics collector: %m")));
                         return 0;
  
  #ifndef EXEC_BACKEND
                 case 0:
                         /* in postmaster child ... */
-#ifdef __BEOS__
-                       /* Specific beos actions after backend startup */
-                       beos_backend_startup();
-#endif
                         /* Close the postmaster's sockets */
-                       ClosePostmasterPorts();
+                       ClosePostmasterPorts(false);
+
+                       /* Lose the postmaster's on-exit routines */
+                       on_exit_reset();
  
                         /* Drop our connection to postmaster's shared memory, as well */
                         PGSharedMemoryDetach();
  
-                       PgstatBufferMain(0, NULL);
+                       PgstatCollectorMain(0, NULL);
                         break;
  #endif
  
@@ -627,29 +625,11 @@ pgstat_start(void)
         return 0;
  }
  
-
-/* ----------
- * pgstat_beterm() -
- *
- *     Called from postmaster to tell collector a backend terminated.
- * ----------
- */
-void
-pgstat_beterm(int pid)
+void allow_immediate_pgstat_restart(void)
  {
-       PgStat_MsgBeterm msg;
-
-       if (pgStatSock < 0)
-               return;
-
-       MemSet(&(msg.m_hdr), 0, sizeof(msg.m_hdr));
-       msg.m_hdr.m_type = PGSTAT_MTYPE_BETERM;
-       msg.m_hdr.m_procpid = pid;
-
-       pgstat_send(&msg, sizeof(msg));
+               last_pgstat_start_time = 0;
  }
  
-
  /* ------------------------------------------------------------
   * Public functions used by backends follow
   *------------------------------------------------------------
@@ -657,101 +637,138 @@ pgstat_beterm(int pid)
  
  
  /* ----------
- * pgstat_bestart() -
+ * pgstat_report_tabstat() -
   *
- *     Tell the collector that this new backend is soon ready to process
- *     queries. Called from tcop/postgres.c before entering the mainloop.
+ *     Called from tcop/postgres.c to send the so far collected per-table
+ *     access statistics to the collector.  Note that this is called only
+ *     when not within a transaction, so it is fair to use transaction stop
+ *     time as an approximation of current time.
   * ----------
   */
  void
-pgstat_bestart(void)
+pgstat_report_tabstat(bool force)
  {
-       PgStat_MsgBestart msg;
+       /* we assume this inits to all zeroes: */
+       static const PgStat_TableCounts all_zeroes;
+       static TimestampTz last_report = 0;     
+
+       TimestampTz now;
+       PgStat_MsgTabstat regular_msg;
+       PgStat_MsgTabstat shared_msg;
+       TabStatusArray *tsa;
+       int                     i;
  
-       if (pgStatSock < 0)
+       /* Don't expend a clock check if nothing to do */
+       if (pgStatTabList == NULL ||
+               pgStatTabList->tsa_used == 0)
                 return;
  
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_BESTART);
-       pgstat_send(&msg, sizeof(msg));
-}
-
+       /*
+        * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+        * msec since we last sent one, or the caller wants to force stats out.
+        */
+       now = GetCurrentTransactionStopTimestamp();
+       if (!force &&
+               !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+               return;
+       last_report = now;
  
-/* ----------
- * pgstat_report_activity() -
- *
- *     Called in tcop/postgres.c to tell the collector what the backend
- *     is actually doing (usually "<IDLE>" or the start of the query to
- *     be executed).
- * ----------
- */
-void
-pgstat_report_activity(const char *what)
-{
-       PgStat_MsgActivity msg;
-       int                     len;
+       /*
+        * Scan through the TabStatusArray struct(s) to find tables that actually
+        * have counts, and build messages to send.  We have to separate shared
+        * relations from regular ones because the databaseid field in the
+        * message header has to depend on that.
+        */
+       regular_msg.m_databaseid = MyDatabaseId;
+       shared_msg.m_databaseid = InvalidOid;
+       regular_msg.m_nentries = 0;
+       shared_msg.m_nentries = 0;
  
-       if (!pgstat_collect_querystring || pgStatSock < 0)
-               return;
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+       {
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+                       PgStat_MsgTabstat *this_msg;
+                       PgStat_TableEntry *this_ent;
  
-       len = strlen(what);
-       len = pg_mbcliplen((const unsigned char *) what, len,
-                                          PGSTAT_ACTIVITY_SIZE - 1);
+                       /* Shouldn't have any pending transaction-dependent counts */
+                       Assert(entry->trans == NULL);
  
-       memcpy(msg.m_what, what, len);
-       msg.m_what[len] = '\0';
-       len += offsetof(PgStat_MsgActivity, m_what) +1;
+                       /*
+                        * Ignore entries that didn't accumulate any actual counts,
+                        * such as indexes that were opened by the planner but not used.
+                        */
+                       if (memcmp(&entry->t_counts, &all_zeroes,
+                                          sizeof(PgStat_TableCounts)) == 0)
+                               continue;
+                       /*
+                        * OK, insert data into the appropriate message, and send if full.
+                        */
+                       this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+                       this_ent = &this_msg->m_entry[this_msg->m_nentries];
+                       this_ent->t_id = entry->t_id;
+                       memcpy(&this_ent->t_counts, &entry->t_counts,
+                                  sizeof(PgStat_TableCounts));
+                       if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+                       {
+                               pgstat_send_tabstat(this_msg);
+                               this_msg->m_nentries = 0;
+                       }
+               }
+               /* zero out TableStatus structs after use */
+               MemSet(tsa->tsa_entries, 0,
+                          tsa->tsa_used * sizeof(PgStat_TableStatus));
+               tsa->tsa_used = 0;
+       }
  
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ACTIVITY);
-       pgstat_send(&msg, len);
+       /*
+        * Send partial messages.  If force is true, make sure that any pending
+        * xact commit/abort gets counted, even if no table stats to send.
+        */
+       if (regular_msg.m_nentries > 0 ||
+               (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+               pgstat_send_tabstat(&regular_msg);
+       if (shared_msg.m_nentries > 0)
+               pgstat_send_tabstat(&shared_msg);
  }
  
-
-/* ----------
- * pgstat_report_tabstat() -
- *
- *     Called from tcop/postgres.c to send the so far collected
- *     per table access statistics to the collector.
- * ----------
+/*
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
   */
-void
-pgstat_report_tabstat(void)
+static void
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
  {
-       int                     i;
+       int                     n;
+       int                     len;
  
-       if (pgStatSock < 0 ||
-               !(pgstat_collect_querystring ||
-                 pgstat_collect_tuplelevel ||
-                 pgstat_collect_blocklevel))
-       {
-               /* Not reporting stats, so just flush whatever we have */
-               pgStatTabstatUsed = 0;
+       /* It's unlikely we'd get here with no socket, but maybe not impossible */
+       if (pgStatSock < 0)
                 return;
-       }
  
         /*
-        * For each message buffer used during the last query set the header
-        * fields and send it out.
+        * Report accumulated xact commit/rollback whenever we send a normal
+        * tabstat message
          */
-       for (i = 0; i < pgStatTabstatUsed; i++)
+       if (OidIsValid(tsmsg->m_databaseid))
         {
-               PgStat_MsgTabstat *tsmsg = pgStatTabstatMessages[i];
-               int                     n;
-               int                     len;
-
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
-
                 tsmsg->m_xact_commit = pgStatXactCommit;
                 tsmsg->m_xact_rollback = pgStatXactRollback;
                 pgStatXactCommit = 0;
                 pgStatXactRollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               pgstat_send(tsmsg, len);
+       }
+       else
+       {
+               tsmsg->m_xact_commit = 0;
+               tsmsg->m_xact_rollback = 0;
         }
  
-       pgStatTabstatUsed = 0;
+       n = tsmsg->m_nentries;
+       len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+               n * sizeof(PgStat_TableEntry);
+
+       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+       pgstat_send(tsmsg, len);
  }
  
  
@@ -761,49 +778,61 @@ pgstat_report_tabstat(void)
   *     Will tell the collector about objects he can get rid of.
   * ----------
   */
-int
+void
  pgstat_vacuum_tabstat(void)
  {
-       Relation        dbrel;
-       HeapScanDesc dbscan;
-       HeapTuple       dbtup;
-       Oid                *dbidlist;
-       int                     dbidalloc;
-       int                     dbidused;
+       HTAB       *htab;
+       PgStat_MsgTabpurge msg;
         HASH_SEQ_STATUS hstat;
         PgStat_StatDBEntry *dbentry;
         PgStat_StatTabEntry *tabentry;
-       HeapTuple       reltup;
-       int                     nobjects = 0;
-       PgStat_MsgTabpurge msg;
         int                     len;
-       int                     i;
  
         if (pgStatSock < 0)
-               return 0;
+               return;
+
+       /*
+        * If not done for this transaction, read the statistics collector stats
+        * file into some hash tables.
+        */
+       backend_read_statsfile();
+
+       /*
+        * Read pg_database and make a list of OIDs of all existing databases
+        */
+       htab = pgstat_collect_oids(DatabaseRelationId);
  
         /*
-        * If not done for this transaction, read the statistics collector
-        * stats file into some hash tables.
+        * Search the database hash table for dead databases and tell the
+        * collector to drop them.
          */
-       if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
+       hash_seq_init(&hstat, pgStatDBHash);
+       while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
         {
-               pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-               pgStatDBHashXact = GetCurrentTransactionId();
+               Oid                     dbid = dbentry->databaseid;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
+                       pgstat_drop_database(dbid);
         }
  
+       /* Clean up */
+       hash_destroy(htab);
+
         /*
-        * Lookup our own database entry
+        * Lookup our own database entry; if not found, nothing more to do.
          */
         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
                                                                                                  (void *) &MyDatabaseId,
                                                                                                  HASH_FIND, NULL);
-       if (dbentry == NULL)
-               return -1;
+       if (dbentry == NULL || dbentry->tables == NULL)
+               return;
  
-       if (dbentry->tables == NULL)
-               return 0;
+       /*
+        * Similarly to above, make a list of all known relations in this DB.
+        */
+       htab = pgstat_collect_oids(RelationRelationId);
  
         /*
          * Initialize our messages table counter to zero
@@ -811,32 +840,25 @@ pgstat_vacuum_tabstat(void)
         msg.m_nentries = 0;
  
         /*
-        * Check for all tables if they still exist.
+        * Check for all tables listed in stats hashtable if they still exist.
          */
         hash_seq_init(&hstat, dbentry->tables);
         while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
         {
-               /*
-                * Check if this relation is still alive by looking up it's
-                * pg_class tuple in the system catalog cache.
-                */
-               reltup = SearchSysCache(RELOID,
-                                                               ObjectIdGetDatum(tabentry->tableid),
-                                                               0, 0, 0);
-               if (HeapTupleIsValid(reltup))
-               {
-                       ReleaseSysCache(reltup);
+               Oid                     tabid = tabentry->tableid;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
                         continue;
-               }
  
                 /*
-                * Add this tables Oid to the message
+                * Not there, so add this table's Oid to the message
                  */
-               msg.m_tableid[msg.m_nentries++] = tabentry->tableid;
-               nobjects++;
+               msg.m_tableid[msg.m_nentries++] = tabid;
  
                 /*
-                * If the message is full, send it out and reinitialize ot zero
+                * If the message is full, send it out and reinitialize to empty
                  */
                 if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
                 {
@@ -844,6 +866,7 @@ pgstat_vacuum_tabstat(void)
                                 +msg.m_nentries * sizeof(Oid);
  
                         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
+                       msg.m_databaseid = MyDatabaseId;
                         pgstat_send(&msg, len);
  
                         msg.m_nentries = 0;
@@ -859,65 +882,55 @@ pgstat_vacuum_tabstat(void)
                         +msg.m_nentries * sizeof(Oid);
  
                 pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
+               msg.m_databaseid = MyDatabaseId;
                 pgstat_send(&msg, len);
         }
  
-       /*
-        * Read pg_database and remember the Oid's of all existing databases
-        */
-       dbidalloc = 256;
-       dbidused = 0;
-       dbidlist = (Oid *) palloc(sizeof(Oid) * dbidalloc);
+       /* Clean up */
+       hash_destroy(htab);
+}
  
-       dbrel = heap_openr(DatabaseRelationName, AccessShareLock);
-       dbscan = heap_beginscan(dbrel, SnapshotNow, 0, NULL);
-       while ((dbtup = heap_getnext(dbscan, ForwardScanDirection)) != NULL)
-       {
-               if (dbidused >= dbidalloc)
-               {
-                       dbidalloc *= 2;
-                       dbidlist = (Oid *) repalloc((char *) dbidlist,
-                                                                               sizeof(Oid) * dbidalloc);
-               }
-               dbidlist[dbidused++] = HeapTupleGetOid(dbtup);
-       }
-       heap_endscan(dbscan);
-       heap_close(dbrel, AccessShareLock);
  
-       /*
-        * Search the database hash table for dead databases and tell the
-        * collector to drop them as well.
-        */
-       hash_seq_init(&hstat, pgStatDBHash);
-       while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+/* ----------
+ * pgstat_collect_oids() -
+ *
+ *     Collect the OIDs of either all databases or all tables, according to
+ *     the parameter, into a temporary hash table.  Caller should hash_destroy
+ *     the result when done with it.
+ * ----------
+ */
+static HTAB *
+pgstat_collect_oids(Oid catalogid)
+{
+       HTAB       *htab;
+       HASHCTL         hash_ctl;
+       Relation        rel;
+       HeapScanDesc scan;
+       HeapTuple       tup;
+
+       memset(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(Oid);
+       hash_ctl.entrysize = sizeof(Oid);
+       hash_ctl.hash = oid_hash;
+       htab = hash_create("Temporary table of OIDs",
+                                          PGSTAT_TAB_HASH_SIZE,
+                                          &hash_ctl,
+                                          HASH_ELEM | HASH_FUNCTION);
+
+       rel = heap_open(catalogid, AccessShareLock);
+       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
         {
-               Oid                     dbid = dbentry->databaseid;
+               Oid             thisoid = HeapTupleGetOid(tup);
  
-               for (i = 0; i < dbidused; i++)
-               {
-                       if (dbidlist[i] == dbid)
-                       {
-                               dbid = InvalidOid;
-                               break;
-                       }
-               }
+               CHECK_FOR_INTERRUPTS();
  
-               if (dbid != InvalidOid)
-               {
-                       nobjects++;
-                       pgstat_drop_database(dbid);
-               }
+               (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
         }
+       heap_endscan(scan);
+       heap_close(rel, AccessShareLock);
  
-       /*
-        * Free the dbid list.
-        */
-       pfree((char *) dbidlist);
-
-       /*
-        * Tell the caller how many removeable objects we found
-        */
-       return nobjects;
+       return htab;
  }
  
  
@@ -925,12 +938,11 @@ pgstat_vacuum_tabstat(void)
   * pgstat_drop_database() -
   *
   *     Tell the collector that we just dropped a database.
- *     This is the only message that shouldn't get lost in space. Otherwise
- *     the collector will keep the statistics for the dead DB until his
- *     stats file got removed while the postmaster is down.
+ *     (If the message gets lost, we will still clean the dead DB eventually
+ *     via future invocations of pgstat_vacuum_tabstat().)
   * ----------
   */
-static void
+void
  pgstat_drop_database(Oid databaseid)
  {
         PgStat_MsgDropdb msg;
@@ -938,13 +950,40 @@ pgstat_drop_database(Oid databaseid)
         if (pgStatSock < 0)
                 return;
  
-       msg.m_databaseid = databaseid;
-
         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
+       msg.m_databaseid = databaseid;
         pgstat_send(&msg, sizeof(msg));
  }
  
  
+/* ----------
+ * pgstat_drop_relation() -
+ *
+ *     Tell the collector that we just dropped a relation.
+ *     (If the message gets lost, we will still clean the dead entry eventually
+ *     via future invocations of pgstat_vacuum_tabstat().)
+ * ----------
+ */
+void
+pgstat_drop_relation(Oid relid)
+{
+       PgStat_MsgTabpurge msg;
+       int                     len;
+
+       if (pgStatSock < 0)
+               return;
+
+       msg.m_tableid[0] = relid;
+       msg.m_nentries = 1;
+
+       len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) +sizeof(Oid);
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
+       msg.m_databaseid = MyDatabaseId;
+       pgstat_send(&msg, len);
+}
+
+
  /* ----------
   * pgstat_reset_counters() -
   *
@@ -962,235 +1001,590 @@ pgstat_reset_counters(void)
         if (!superuser())
                 ereport(ERROR,
                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-                         errmsg("must be superuser to reset statistics counters")));
+                                errmsg("must be superuser to reset statistics counters")));
  
         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
+       msg.m_databaseid = MyDatabaseId;
         pgstat_send(&msg, sizeof(msg));
  }
  
  
  /* ----------
- * pgstat_ping() -
+ * pgstat_report_autovac() -
   *
- *     Send some junk data to the collector to increase traffic.
+ *     Called from autovacuum.c to report startup of an autovacuum process.
+ *     We are called before InitPostgres is done, so can't rely on MyDatabaseId;
+ *     the db OID must be passed in, instead.
   * ----------
   */
  void
-pgstat_ping(void)
+pgstat_report_autovac(Oid dboid)
  {
-       PgStat_MsgDummy msg;
+       PgStat_MsgAutovacStart msg;
  
         if (pgStatSock < 0)
                 return;
  
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
+       msg.m_databaseid = dboid;
+       msg.m_start_time = GetCurrentTimestamp();
+
         pgstat_send(&msg, sizeof(msg));
  }
  
-/*
- * Create or enlarge the pgStatTabstatMessages array
+
+/* ---------
+ * pgstat_report_vacuum() -
+ *
+ *     Tell the collector about the table we just vacuumed.
+ * ---------
   */
-static bool
-more_tabstat_space(void)
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+                                        bool analyze, PgStat_Counter tuples)
  {
-       PgStat_MsgTabstat *newMessages;
-       PgStat_MsgTabstat **msgArray;
-       int                     newAlloc = pgStatTabstatAlloc + TABSTAT_QUANTUM;
-       int                     i;
+       PgStat_MsgVacuum msg;
  
-       /* Create (another) quantum of message buffers */
-       newMessages = (PgStat_MsgTabstat *)
-               malloc(sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
-       if (newMessages == NULL)
-       {
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                errmsg("out of memory")));
-               return false;
-       }
+       if (pgStatSock < 0 ||
+               !pgstat_collect_tuplelevel)
+               return;
  
-       /* Create or enlarge the pointer array */
-       if (pgStatTabstatMessages == NULL)
-               msgArray = (PgStat_MsgTabstat **)
-                       malloc(sizeof(PgStat_MsgTabstat *) * newAlloc);
-       else
-               msgArray = (PgStat_MsgTabstat **)
-                       realloc(pgStatTabstatMessages,
-                                       sizeof(PgStat_MsgTabstat *) * newAlloc);
-       if (msgArray == NULL)
-       {
-               free(newMessages);
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                errmsg("out of memory")));
-               return false;
-       }
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
+       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+       msg.m_tableoid = tableoid;
+       msg.m_analyze = analyze;
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
+       msg.m_vacuumtime = GetCurrentTimestamp();
+       msg.m_tuples = tuples;
+       pgstat_send(&msg, sizeof(msg));
+}
+
+/* --------
+ * pgstat_report_analyze() -
+ *
+ *     Tell the collector about the table we just analyzed.
+ * --------
+ */
+void
+pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
+                                         PgStat_Counter deadtuples)
+{
+       PgStat_MsgAnalyze msg;
+
+       if (pgStatSock < 0 ||
+               !pgstat_collect_tuplelevel)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
+       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+       msg.m_tableoid = tableoid;
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
+       msg.m_analyzetime = GetCurrentTimestamp();
+       msg.m_live_tuples = livetuples;
+       msg.m_dead_tuples = deadtuples;
+       pgstat_send(&msg, sizeof(msg));
+}
+
+
+/* ----------
+ * pgstat_ping() -
+ *
+ *     Send some junk data to the collector to increase traffic.
+ * ----------
+ */
+void
+pgstat_ping(void)
+{
+       PgStat_MsgDummy msg;
  
-       MemSet(newMessages, 0, sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
-       for (i = 0; i < TABSTAT_QUANTUM; i++)
-               msgArray[pgStatTabstatAlloc + i] = newMessages++;
-       pgStatTabstatMessages = msgArray;
-       pgStatTabstatAlloc = newAlloc;
+       if (pgStatSock < 0)
+               return;
  
-       return true;
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+       pgstat_send(&msg, sizeof(msg));
  }
  
+
  /* ----------
   * pgstat_initstats() -
   *
- *     Called from various places usually dealing with initialization
- *     of Relation or Scan structures. The data placed into these
- *     structures from here tell where later to count for buffer reads,
- *     scans and tuples fetched.
+ *     Initialize a relcache entry to count access statistics.
+ *     Called whenever a relation is opened.
+ *
+ *     We assume that a relcache entry's pgstat_info field is zeroed by
+ *     relcache.c when the relcache entry is made; thereafter it is long-lived
+ *     data.  We can avoid repeated searches of the TabStatus arrays when the
+ *     same relation is touched repeatedly within a transaction.
   * ----------
   */
  void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
  {
         Oid                     rel_id = rel->rd_id;
-       PgStat_TableEntry *useent;
-       PgStat_MsgTabstat *tsmsg;
-       int                     mb;
-       int                     i;
+       char            relkind = rel->rd_rel->relkind;
  
-       /*
-        * Initialize data not to count at all.
-        */
-       stats->tabentry = NULL;
-       stats->no_stats = FALSE;
-       stats->heap_scan_counted = FALSE;
-       stats->index_scan_counted = FALSE;
+       /* We only count stats for things that have storage */
+       if (!(relkind == RELKIND_RELATION ||
+                 relkind == RELKIND_INDEX ||
+                 relkind == RELKIND_TOASTVALUE))
+       {
+               rel->pgstat_info = NULL;
+               return;
+       }
  
         if (pgStatSock < 0 ||
                 !(pgstat_collect_tuplelevel ||
                   pgstat_collect_blocklevel))
         {
-               stats->no_stats = TRUE;
+               /* We're not counting at all */
+               rel->pgstat_info = NULL;
                 return;
         }
  
         /*
-        * Search the already-used message slots for this relation.
+        * If we already set up this relation in the current transaction,
+        * nothing to do.
          */
-       for (mb = 0; mb < pgStatTabstatUsed; mb++)
-       {
-               tsmsg = pgStatTabstatMessages[mb];
+       if (rel->pgstat_info != NULL &&
+               rel->pgstat_info->t_id == rel_id)
+               return;
+
+       /* Else find or make the PgStat_TableStatus entry, and update link */
+       rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       TabStatusArray *prev_tsa;
+       int                     i;
  
-               for (i = tsmsg->m_nentries; --i >= 0; )
+       /*
+        * Search the already-used tabstat slots for this relation.
+        */
+       prev_tsa = NULL;
+       for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
+       {
+               for (i = 0; i < tsa->tsa_used; i++)
                 {
-                       if (tsmsg->m_entry[i].t_id == rel_id)
-                       {
-                               stats->tabentry = (void *) &(tsmsg->m_entry[i]);
-                               return;
-                       }
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
                 }
  
-               if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-                       continue;
-
-               /*
-                * Not found, but found a message buffer with an empty slot
-                * instead. Fine, let's use this one.
-                */
-               i = tsmsg->m_nentries++;
-               useent = &tsmsg->m_entry[i];
-               MemSet(useent, 0, sizeof(PgStat_TableEntry));
-               useent->t_id = rel_id;
-               stats->tabentry = (void *) useent;
-               return;
+               if (tsa->tsa_used < TABSTAT_QUANTUM)
+               {
+                       /*
+                        * It must not be present, but we found a free slot instead.
+                        * Fine, let's use this one.  We assume the entry was already
+                        * zeroed, either at creation or after last use.
+                        */
+                       entry = &tsa->tsa_entries[tsa->tsa_used++];
+                       entry->t_id = rel_id;
+                       entry->t_shared = isshared;
+                       return entry;
+               }
         }
  
         /*
-        * If we ran out of message buffers, we just allocate more.
+        * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
+        */
+       tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+                                                                                                       sizeof(TabStatusArray));
+       if (prev_tsa)
+               prev_tsa->tsa_next = tsa;
+       else
+               pgStatTabList = tsa;
+
+       /*
+        * Use the first entry of the new TabStatusArray.
          */
-       if (pgStatTabstatUsed >= pgStatTabstatAlloc)
+       entry = &tsa->tsa_entries[tsa->tsa_used++];
+       entry->t_id = rel_id;
+       entry->t_shared = isshared;
+       return entry;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state == NULL || xact_state->nest_level != nest_level)
         {
-               if (!more_tabstat_space())
-               {
-                       stats->no_stats = TRUE;
-                       return;
-               }
-               Assert(pgStatTabstatUsed < pgStatTabstatAlloc);
+               xact_state = (PgStat_SubXactStatus *)
+                       MemoryContextAlloc(TopTransactionContext,
+                                                          sizeof(PgStat_SubXactStatus));
+               xact_state->nest_level = nest_level;
+               xact_state->prev = pgStatXactStack;
+               xact_state->first = NULL;
+               pgStatXactStack = xact_state;
         }
+       return xact_state;
+}
+
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+       PgStat_TableXactStatus *trans;
  
         /*
-        * Use the first entry of the next message buffer.
+        * If this is the first rel to be modified at the current nest level,
+        * we first have to push a transaction stack entry.
          */
-       mb = pgStatTabstatUsed++;
-       tsmsg = pgStatTabstatMessages[mb];
-       tsmsg->m_nentries = 1;
-       useent = &tsmsg->m_entry[0];
-       MemSet(useent, 0, sizeof(PgStat_TableEntry));
-       useent->t_id = rel_id;
-       stats->tabentry = (void *) useent;
+       xact_state = get_tabstat_stack_level(nest_level);
+
+       /* Now make a per-table stack entry */
+       trans = (PgStat_TableXactStatus *)
+               MemoryContextAllocZero(TopTransactionContext,
+                                                          sizeof(PgStat_TableXactStatus));
+       trans->nest_level = nest_level;
+       trans->upper = pgstat_info->trans;
+       trans->parent = pgstat_info;
+       trans->next = xact_state->first;
+       xact_state->first = trans;
+       pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_inserted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_inserted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_inserted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update
+ */
+void
+pgstat_count_heap_update(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_updated is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_updated++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               /* An UPDATE both inserts a new tuple and deletes the old */
+               pgstat_info->trans->tuples_inserted++;
+               pgstat_info->trans->tuples_deleted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_deleted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_deleted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_deleted++;
+       }
  }
  
  
  /* ----------
- * pgstat_count_xact_commit() -
+ * AtEOXact_PgStat
   *
- *     Called from access/transam/xact.c to count transaction commits.
+ *     Called from access/transam/xact.c at top-level transaction commit/abort.
   * ----------
   */
  void
-pgstat_count_xact_commit(void)
+AtEOXact_PgStat(bool isCommit)
  {
-       if (!(pgstat_collect_querystring ||
-                 pgstat_collect_tuplelevel ||
-                 pgstat_collect_blocklevel))
-               return;
+       PgStat_SubXactStatus *xact_state;
  
-       pgStatXactCommit++;
+       /*
+        * Count transaction commit or abort.  (We use counters, not just bools,
+        * in case the reporting message isn't sent right away.)
+        */
+       if (isCommit)
+               pgStatXactCommit++;
+       else
+               pgStatXactRollback++;
  
         /*
-        * If there was no relation activity yet, just make one existing
-        * message buffer used without slots, causing the next report to tell
-        * new xact-counters.
+        * Transfer transactional insert/update counts into the base tabstat
+        * entries.  We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
          */
-       if (pgStatTabstatAlloc == 0)
-       {
-               if (!more_tabstat_space())
-                       return;
-       }
-       if (pgStatTabstatUsed == 0)
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
         {
-               pgStatTabstatUsed++;
-               pgStatTabstatMessages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+                       }
+                       else
+                       {
+                               /* inserted tuples are dead, deleted tuples are unaffected */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                       }
+                       tabstat->trans = NULL;
+               }
         }
-}
+       pgStatXactStack = NULL;
  
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
  
  /* ----------
- * pgstat_count_xact_rollback() -
+ * AtEOSubXact_PgStat
   *
- *     Called from access/transam/xact.c to count transaction rollbacks.
+ *     Called from access/transam/xact.c at subtransaction commit/abort.
   * ----------
   */
  void
-pgstat_count_xact_rollback(void)
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
  {
-       if (!(pgstat_collect_querystring ||
-                 pgstat_collect_tuplelevel ||
-                 pgstat_collect_blocklevel))
-               return;
-
-       pgStatXactRollback++;
+       PgStat_SubXactStatus *xact_state;
  
         /*
-        * If there was no relation activity yet, just make one existing
-        * message buffer used without slots, causing the next report to tell
-        * new xact-counters.
+        * Transfer transactional insert/update counts into the next higher
+        * subtransaction state.
          */
-       if (pgStatTabstatAlloc == 0)
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL &&
+               xact_state->nest_level >= nestDepth)
+       {
+               PgStat_TableXactStatus *trans;
+               PgStat_TableXactStatus *next_trans;
+
+               /* delink xact_state from stack immediately to simplify reuse case */
+               pgStatXactStack = xact_state->prev;
+
+               for (trans = xact_state->first; trans != NULL; trans = next_trans)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       next_trans = trans->next;
+                       Assert(trans->nest_level == nestDepth);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+                               {
+                                       trans->upper->tuples_inserted += trans->tuples_inserted;
+                                       trans->upper->tuples_deleted += trans->tuples_deleted;
+                                       tabstat->trans = trans->upper;
+                                       pfree(trans);
+                               }
+                               else
+                               {
+                                       /*
+                                        * When there isn't an immediate parent state, we can
+                                        * just reuse the record instead of going through a
+                                        * palloc/pfree pushup (this works since it's all in
+                                        * TopTransactionContext anyway).  We have to re-link
+                                        * it into the parent level, though, and that might mean
+                                        * pushing a new entry into the pgStatXactStack.
+                                        */
+                                       PgStat_SubXactStatus *upper_xact_state;
+
+                                       upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+                                       trans->next = upper_xact_state->first;
+                                       upper_xact_state->first = trans;
+                                       trans->nest_level = nestDepth - 1;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * On abort, inserted tuples are dead (and can be bounced out
+                                * to the top-level tabstat), deleted tuples are unaffected
+                                */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                               tabstat->trans = trans->upper;
+                               pfree(trans);
+                       }
+               }
+               pfree(xact_state);
+       }
+}
+
+
+/*
+ * AtPrepare_PgStat
+ *             Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
         {
-               if (!more_tabstat_space())
-                       return;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+                       TwoPhasePgStatRecord record;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+
+                       record.tuples_inserted = trans->tuples_inserted;
+                       record.tuples_deleted = trans->tuples_deleted;
+                       record.t_id = tabstat->t_id;
+                       record.t_shared = tabstat->t_shared;
+
+                       RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+                                                                  &record, sizeof(TwoPhasePgStatRecord));
+               }
         }
-       if (pgStatTabstatUsed == 0)
+}
+
+/*
+ * PostPrepare_PgStat
+ *             Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
         {
-               pgStatTabstatUsed++;
-               pgStatTabstatMessages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       tabstat = trans->parent;
+                       tabstat->trans = NULL;
+               }
         }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                  void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                 void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* inserted tuples are dead, deleted tuples are no-ops */
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
  }
  
  
@@ -1206,30 +1600,18 @@ pgstat_count_xact_rollback(void)
  PgStat_StatDBEntry *
  pgstat_fetch_stat_dbentry(Oid dbid)
  {
-       PgStat_StatDBEntry *dbentry;
-
         /*
-        * If not done for this transaction, read the statistics collector
-        * stats file into some hash tables. Be careful with the
-        * read_statsfile() call below!
+        * If not done for this transaction, read the statistics collector stats
+        * file into some hash tables.
          */
-       if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-       {
-               pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-               pgStatDBHashXact = GetCurrentTransactionId();
-       }
+       backend_read_statsfile();
  
         /*
-        * Lookup the requested database
+        * Lookup the requested database; return NULL if not found
          */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                                (void *) &dbid,
-                                                                                                HASH_FIND, NULL);
-       if (dbentry == NULL)
-               return NULL;
-
-       return dbentry;
+       return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+                                                                                         (void *) &dbid,
+                                                                                         HASH_FIND, NULL);
  }
  
  
@@ -1245,42 +1627,49 @@ pgstat_fetch_stat_dbentry(Oid dbid)
  PgStat_StatTabEntry *
  pgstat_fetch_stat_tabentry(Oid relid)
  {
+       Oid                     dbid;
         PgStat_StatDBEntry *dbentry;
         PgStat_StatTabEntry *tabentry;
  
         /*
-        * If not done for this transaction, read the statistics collector
-        * stats file into some hash tables. Be careful with the
-        * read_statsfile() call below!
+        * If not done for this transaction, read the statistics collector stats
+        * file into some hash tables.
          */
-       if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-       {
-               pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-               pgStatDBHashXact = GetCurrentTransactionId();
-       }
+       backend_read_statsfile();
  
         /*
-        * Lookup our database.
+        * Lookup our database, then look in its table hash table.
          */
+       dbid = MyDatabaseId;
         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                                (void *) &MyDatabaseId,
+                                                                                                (void *) &dbid,
                                                                                                  HASH_FIND, NULL);
-       if (dbentry == NULL)
-               return NULL;
+       if (dbentry != NULL && dbentry->tables != NULL)
+       {
+               tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
+                                                                                                          (void *) &relid,
+                                                                                                          HASH_FIND, NULL);
+               if (tabentry)
+                       return tabentry;
+       }
  
         /*
-        * Now inside the DB's table hash table lookup the requested one.
+        * If we didn't find it, maybe it's a shared table.
          */
-       if (dbentry->tables == NULL)
-               return NULL;
-       tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                                                                  (void *) &relid,
-                                                                                                  HASH_FIND, NULL);
-       if (tabentry == NULL)
-               return NULL;
+       dbid = InvalidOid;
+       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+                                                                                                (void *) &dbid,
+                                                                                                HASH_FIND, NULL);
+       if (dbentry != NULL && dbentry->tables != NULL)
+       {
+               tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
+                                                                                                          (void *) &relid,
+                                                                                                          HASH_FIND, NULL);
+               if (tabentry)
+                       return tabentry;
+       }
  
-       return tabentry;
+       return NULL;
  }
  
  
@@ -1288,25 +1677,21 @@ pgstat_fetch_stat_tabentry(Oid relid)
   * pgstat_fetch_stat_beentry() -
   *
   *     Support function for the SQL-callable pgstat* functions. Returns
- *     the actual activity slot of one active backend. The caller is
- *     responsible for a check if the actual user is permitted to see
- *     that info (especially the querystring).
+ *     our local copy of the current-activity entry for one backend.
+ *
+ *     NB: caller is responsible for a check if the user is permitted to see
+ *     this info (especially the querystring).
   * ----------
   */
-PgStat_StatBeEntry *
+PgBackendStatus *
  pgstat_fetch_stat_beentry(int beid)
  {
-       if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-       {
-               pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-               pgStatDBHashXact = GetCurrentTransactionId();
-       }
+       pgstat_read_current_status();
  
-       if (beid < 1 || beid > pgStatNumBackends)
+       if (beid < 1 || beid > localNumBackends)
                 return NULL;
  
-       return &pgStatBeTable[beid - 1];
+       return &localBackendStatusTable[beid - 1];
  }
  
  
@@ -1320,617 +1705,612 @@ pgstat_fetch_stat_beentry(int beid)
  int
  pgstat_fetch_stat_numbackends(void)
  {
-       if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-       {
-               pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-               pgStatDBHashXact = GetCurrentTransactionId();
-       }
+       pgstat_read_current_status();
  
-       return pgStatNumBackends;
+       return localNumBackends;
  }
  
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ *  Support function for the SQL-callable pgstat* functions. Returns
+ *  a pointer to the global statistics struct.
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
+{
+       backend_read_statsfile();
+
+       return &globalStats;
+}
  
  
  /* ------------------------------------------------------------
- * Local support functions follow
+ * Functions for management of the shared-memory PgBackendStatus array
   * ------------------------------------------------------------
   */
  
+static PgBackendStatus *BackendStatusArray = NULL;
+static PgBackendStatus *MyBEEntry = NULL;
  
-/* ----------
- * pgstat_setheader() -
- *
- *             Set common header fields in a statistics message
- * ----------
+
+/*
+ * Report shared-memory space needed by CreateSharedBackendStatus.
   */
-static void
-pgstat_setheader(PgStat_MsgHdr *hdr, int mtype)
+Size
+BackendStatusShmemSize(void)
  {
-       hdr->m_type = mtype;
-       hdr->m_backendid = MyBackendId;
-       hdr->m_procpid = MyProcPid;
-       hdr->m_databaseid = MyDatabaseId;
-       hdr->m_userid = GetSessionUserId();
-}
+       Size            size;
  
+       size = mul_size(sizeof(PgBackendStatus), MaxBackends);
+       return size;
+}
  
-/* ----------
- * pgstat_send() -
- *
- *             Send out one statistics message to the collector
- * ----------
+/*
+ * Initialize the shared status array during postmaster startup.
   */
-static void
-pgstat_send(void *msg, int len)
+void
+CreateSharedBackendStatus(void)
  {
-       if (pgStatSock < 0)
-               return;
+       Size            size = BackendStatusShmemSize();
+       bool            found;
  
-       ((PgStat_MsgHdr *) msg)->m_size = len;
+       /* Create or attach to the shared array */
+       BackendStatusArray = (PgBackendStatus *)
+               ShmemInitStruct("Backend Status Array", size, &found);
  
-       send(pgStatSock, msg, len, 0);
-       /* We deliberately ignore any error from send() */
+       if (!found)
+       {
+               /*
+                * We're the first - initialize.
+                */
+               MemSet(BackendStatusArray, 0, size);
+       }
  }
  
  
  /* ----------
- * PgstatBufferMain() -
- *
- *     Start up the statistics buffer process.  This is the body of the
- *     postmaster child process.
+ * pgstat_bestart() -
   *
- *     The argc/argv parameters are valid only in EXEC_BACKEND case.
+ *     Initialize this backend's entry in the PgBackendStatus array,
+ *     and set up an on-proc-exit hook that will clear it again.
+ *     Called from InitPostgres.  MyBackendId and MyDatabaseId must be set.
   * ----------
   */
-NON_EXEC_STATIC void
-PgstatBufferMain(int argc, char *argv[])
+void
+pgstat_bestart(void)
  {
-       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
+       volatile PgBackendStatus *beentry;
+       TimestampTz proc_start_timestamp;
+       Oid                     userid;
+       SockAddr        clientaddr;
  
-       MyProcPid = getpid();           /* reset MyProcPid */
-
-       /* Lose the postmaster's on-exit routines */
-       on_exit_reset();
+       Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+       MyBEEntry = &BackendStatusArray[MyBackendId - 1];
  
         /*
-        * Ignore all signals usually bound to some action in the postmaster,
-        * except for SIGCHLD and SIGQUIT --- see pgstat_recvbuffer.
+        * To minimize the time spent modifying the entry, fetch all the needed
+        * data first.
+        *
+        * If we have a MyProcPort, use its session start time (for consistency,
+        * and to save a kernel call).
          */
-       pqsignal(SIGHUP, SIG_IGN);
-       pqsignal(SIGINT, SIG_IGN);
-       pqsignal(SIGTERM, SIG_IGN);
-       pqsignal(SIGQUIT, pgstat_exit);
-       pqsignal(SIGALRM, SIG_IGN);
-       pqsignal(SIGPIPE, SIG_IGN);
-       pqsignal(SIGUSR1, SIG_IGN);
-       pqsignal(SIGUSR2, SIG_IGN);
-       pqsignal(SIGCHLD, pgstat_die);
-       pqsignal(SIGTTIN, SIG_DFL);
-       pqsignal(SIGTTOU, SIG_DFL);
-       pqsignal(SIGCONT, SIG_DFL);
-       pqsignal(SIGWINCH, SIG_DFL);
-       /* unblock will happen in pgstat_recvbuffer */
+       if (MyProcPort)
+               proc_start_timestamp = MyProcPort->SessionStartTime;
+       else
+               proc_start_timestamp = GetCurrentTimestamp();
+       userid = GetSessionUserId();
  
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc,argv);
-#endif
+       /*
+        * We may not have a MyProcPort (eg, if this is the autovacuum process).
+        * If so, use all-zeroes client address, which is dealt with specially in
+        * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
+        */
+       if (MyProcPort)
+               memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr));
+       else
+               MemSet(&clientaddr, 0, sizeof(clientaddr));
  
         /*
-        * Start a buffering process to read from the socket, so we have a
-        * little more time to process incoming messages.
-        *
-        * NOTE: the process structure is: postmaster is parent of buffer process
-        * is parent of collector process.      This way, the buffer can detect
-        * collector failure via SIGCHLD, whereas otherwise it wouldn't notice
-        * collector failure until it tried to write on the pipe.  That would
-        * mean that after the postmaster started a new collector, we'd have
-        * two buffer processes competing to read from the UDP socket --- not
-        * good.
-        */
-       if (pgpipe(pgStatPipe) < 0)
+        * Initialize my status entry, following the protocol of bumping
+        * st_changecount before and after; and make sure it's even afterwards. We
+        * use a volatile pointer here to ensure the compiler doesn't try to get
+        * cute.
+        */
+       beentry = MyBEEntry;
+       do
         {
-               ereport(LOG,
-                               (errcode_for_socket_access(),
-                        errmsg("could not create pipe for statistics buffer: %m")));
-               exit(1);
-       }
+               beentry->st_changecount++;
+       } while ((beentry->st_changecount & 1) == 0);
  
-#ifdef EXEC_BACKEND
-       /* child becomes collector process */
-       switch (pgstat_forkexec(STAT_PROC_COLLECTOR))
-#else
-       switch (fork())
-#endif
-       {
-               case -1:
-                       ereport(LOG,
-                                       (errmsg("could not fork statistics collector: %m")));
-                       exit(1);
+       beentry->st_procpid = MyProcPid;
+       beentry->st_proc_start_timestamp = proc_start_timestamp;
+       beentry->st_activity_start_timestamp = 0;
+       beentry->st_txn_start_timestamp = 0;
+       beentry->st_databaseid = MyDatabaseId;
+       beentry->st_userid = userid;
+       beentry->st_clientaddr = clientaddr;
+       beentry->st_waiting = false;
+       beentry->st_activity[0] = '\0';
+       /* Also make sure the last byte in the string area is always 0 */
+       beentry->st_activity[PGBE_ACTIVITY_SIZE - 1] = '\0';
  
-#ifndef EXEC_BACKEND
-               case 0:
-                       /* child becomes collector process */
-                       PgstatCollectorMain(0, NULL);
-                       break;
-#endif
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
  
-               default:
-                       /* parent becomes buffer process */
-                       closesocket(pgStatPipe[0]);
-                       pgstat_recvbuffer();
-       }
-       exit(0);
+       /*
+        * Set up a process-exit hook to clean up.
+        */
+       on_shmem_exit(pgstat_beshutdown_hook, 0);
  }
  
-
-/* ----------
- * PgstatCollectorMain() -
+/*
+ * Shut down a single backend's statistics reporting at process exit.
   *
- *     Start up the statistics collector itself.  This is the body of the
- *     postmaster grandchild process.
+ * Flush any remaining statistics counts out to the collector.
+ * Without this, operations triggered during backend exit (such as
+ * temp table deletions) won't be counted.
   *
- *     The argc/argv parameters are valid only in EXEC_BACKEND case.
- * ----------
+ * Lastly, clear out our entry in the PgBackendStatus array.
   */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+static void
+pgstat_beshutdown_hook(int code, Datum arg)
  {
-       PgStat_Msg      msg;
-       fd_set          rfds;
-       int                     readPipe;
-       int                     nready;
-       int                     len = 0;
-       struct timeval timeout;
-       struct timeval next_statwrite;
-       bool            need_statwrite;
-       HASHCTL         hash_ctl;
+       volatile PgBackendStatus *beentry = MyBEEntry;
  
-       MyProcPid = getpid();           /* reset MyProcPid */
+       pgstat_report_tabstat(true);
  
         /*
-        * Reset signal handling.  With the exception of restoring default
-        * SIGCHLD and SIGQUIT handling, this is a no-op in the non-EXEC_BACKEND
-        * case because we'll have inherited these settings from the buffer
-        * process; but it's not a no-op for EXEC_BACKEND.
+        * Clear my status entry, following the protocol of bumping st_changecount
+        * before and after.  We use a volatile pointer here to ensure the
+        * compiler doesn't try to get cute.
          */
-       pqsignal(SIGHUP, SIG_IGN);
-       pqsignal(SIGINT, SIG_IGN);
-       pqsignal(SIGTERM, SIG_IGN);
-       pqsignal(SIGQUIT, SIG_IGN);
-       pqsignal(SIGALRM, SIG_IGN);
-       pqsignal(SIGPIPE, SIG_IGN);
-       pqsignal(SIGUSR1, SIG_IGN);
-       pqsignal(SIGUSR2, SIG_IGN);
-       pqsignal(SIGCHLD, SIG_DFL);
-       pqsignal(SIGTTIN, SIG_DFL);
-       pqsignal(SIGTTOU, SIG_DFL);
-       pqsignal(SIGCONT, SIG_DFL);
-       pqsignal(SIGWINCH, SIG_DFL);
-       PG_SETMASK(&UnBlockSig);
+       beentry->st_changecount++;
  
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc,argv);
-#endif
+       beentry->st_procpid = 0;        /* mark invalid */
  
-       /* Close unwanted files */
-       closesocket(pgStatPipe[1]);
-       closesocket(pgStatSock);
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
  
-       /*
-        * Identify myself via ps
-        */
-       init_ps_display("stats collector process", "", "");
-       set_ps_display("");
  
-       /*
-        * Initialize filenames needed for status reports.
-        */
-       snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);
-       /* tmpfname need only be set correctly in this process */
-       snprintf(pgStat_tmpfname, MAXPGPATH, PGSTAT_STAT_TMPFILE,
-                        DataDir, getpid());
+/* ----------
+ * pgstat_report_activity() -
+ *
+ *     Called from tcop/postgres.c to report what the backend is actually doing
+ *     (usually "<IDLE>" or the start of the query to be executed).
+ * ----------
+ */
+void
+pgstat_report_activity(const char *cmd_str)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+       TimestampTz start_timestamp;
+       int                     len;
  
-       /*
-        * Arrange to write the initial status file right away
-        */
-       gettimeofday(&next_statwrite, NULL);
-       need_statwrite = TRUE;
+       if (!pgstat_collect_querystring || !beentry)
+               return;
  
         /*
-        * Read in an existing statistics stats file or initialize the stats
-        * to zero.
+        * To minimize the time spent modifying the entry, fetch all the needed
+        * data first.
          */
-       pgStatRunningInCollector = TRUE;
-       pgstat_read_statsfile(&pgStatDBHash, InvalidOid, NULL, NULL);
+       start_timestamp = GetCurrentStatementStartTimestamp();
+
+       len = strlen(cmd_str);
+       len = pg_mbcliplen(cmd_str, len, PGBE_ACTIVITY_SIZE - 1);
  
         /*
-        * Create the dead backend hashtable
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer here to
+        * ensure the compiler doesn't try to get cute.
          */
-       memset(&hash_ctl, 0, sizeof(hash_ctl));
-       hash_ctl.keysize = sizeof(int);
-       hash_ctl.entrysize = sizeof(PgStat_StatBeDead);
-       hash_ctl.hash = tag_hash;
-       pgStatBeDead = hash_create("Dead Backends", PGSTAT_BE_HASH_SIZE,
-                                                          &hash_ctl, HASH_ELEM | HASH_FUNCTION);
-       if (pgStatBeDead == NULL)
-       {
-               /* assume the problem is out-of-memory */
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                errmsg("out of memory in statistics collector --- abort")));
-               exit(1);
-       }
+       beentry->st_changecount++;
+
+       beentry->st_activity_start_timestamp = start_timestamp;
+       memcpy((char *) beentry->st_activity, cmd_str, len);
+       beentry->st_activity[len] = '\0';
+
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
+
+/*
+ * Set the current transaction start timestamp to the specified
+ * value. If there is no current active transaction, this is signified
+ * by 0.
+ */
+void
+pgstat_report_txn_timestamp(TimestampTz tstamp)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+
+       if (!pgstat_collect_querystring || !beentry)
+               return;
  
         /*
-        * Create the known backends table
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer
+        * here to ensure the compiler doesn't try to get cute.
          */
-       pgStatBeTable = (PgStat_StatBeEntry *) malloc(
-                                                          sizeof(PgStat_StatBeEntry) * MaxBackends);
-       if (pgStatBeTable == NULL)
-       {
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                errmsg("out of memory in statistics collector --- abort")));
-               exit(1);
-       }
-       memset(pgStatBeTable, 0, sizeof(PgStat_StatBeEntry) * MaxBackends);
+       beentry->st_changecount++;
+       beentry->st_txn_start_timestamp = tstamp;
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
+
+/* ----------
+ * pgstat_report_waiting() -
+ *
+ *     Called from lock manager to report beginning or end of a lock wait.
+ *
+ * NB: this *must* be able to survive being called before MyBEEntry has been
+ * initialized.
+ * ----------
+ */
+void
+pgstat_report_waiting(bool waiting)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
  
-       readPipe = pgStatPipe[0];
+       if (!pgstat_collect_querystring || !beentry)
+               return;
  
         /*
-        * Process incoming messages and handle all the reporting stuff until
-        * there are no more messages.
+        * Since this is a single-byte field in a struct that only this process
+        * may modify, there seems no need to bother with the st_changecount
+        * protocol.  The update must appear atomic in any case.
          */
-       for (;;)
-       {
-               /*
-                * If we need to write the status file again (there have been
-                * changes in the statistics since we wrote it last) calculate the
-                * timeout until we have to do so.
-                */
-               if (need_statwrite)
-               {
-                       struct timeval now;
+       beentry->st_waiting = waiting;
+}
  
-                       gettimeofday(&now, NULL);
-                       /* avoid assuming that tv_sec is signed */
-                       if (now.tv_sec > next_statwrite.tv_sec ||
-                               (now.tv_sec == next_statwrite.tv_sec &&
-                                now.tv_usec >= next_statwrite.tv_usec))
-                       {
-                               timeout.tv_sec = 0;
-                               timeout.tv_usec = 0;
-                       }
-                       else
-                       {
-                               timeout.tv_sec = next_statwrite.tv_sec - now.tv_sec;
-                               timeout.tv_usec = next_statwrite.tv_usec - now.tv_usec;
-                               if (timeout.tv_usec < 0)
-                               {
-                                       timeout.tv_sec--;
-                                       timeout.tv_usec += 1000000;
-                               }
-                       }
-               }
  
-               /*
-                * Setup the descriptor set for select(2)
-                */
-               FD_ZERO(&rfds);
-               FD_SET(readPipe, &rfds);
+/* ----------
+ * pgstat_read_current_status() -
+ *
+ *     Copy the current contents of the PgBackendStatus array to local memory,
+ *     if not already done in this transaction.
+ * ----------
+ */
+static void
+pgstat_read_current_status(void)
+{
+       volatile PgBackendStatus *beentry;
+       PgBackendStatus *localtable;
+       PgBackendStatus *localentry;
+       int                     i;
  
-               /*
-                * Now wait for something to do.
-                */
-               nready = select(readPipe+1, &rfds, NULL, NULL,
-                                               (need_statwrite) ? &timeout : NULL);
-               if (nready < 0)
-               {
-                       if (errno == EINTR)
-                               continue;
-                       ereport(LOG,
-                                       (errcode_for_socket_access(),
-                                  errmsg("select() failed in statistics collector: %m")));
-                       exit(1);
-               }
+       Assert(!pgStatRunningInCollector);
+       if (localBackendStatusTable)
+               return;                                 /* already done */
  
-               /*
-                * If there are no descriptors ready, our timeout for writing the
-                * stats file happened.
-                */
-               if (nready == 0)
-               {
-                       pgstat_write_statsfile();
-                       need_statwrite = FALSE;
+       pgstat_setup_memcxt();
  
-                       continue;
-               }
+       localtable = (PgBackendStatus *)
+               MemoryContextAlloc(pgStatLocalContext,
+                                                  sizeof(PgBackendStatus) * MaxBackends);
+       localNumBackends = 0;
  
+       beentry = BackendStatusArray;
+       localentry = localtable;
+       for (i = 1; i <= MaxBackends; i++)
+       {
                 /*
-                * Check if there is a new statistics message to collect.
+                * Follow the protocol of retrying if st_changecount changes while we
+                * copy the entry, or if it's odd.  (The check for odd is needed to
+                * cover the case where we are able to completely copy the entry while
+                * the source backend is between increment steps.)      We use a volatile
+                * pointer here to ensure the compiler doesn't try to get cute.
                  */
-               if (FD_ISSET(readPipe, &rfds))
+               for (;;)
                 {
-                       /*
-                        * We may need to issue multiple read calls in case the buffer
-                        * process didn't write the message in a single write, which
-                        * is possible since it dumps its buffer bytewise. In any
-                        * case, we'd need two reads since we don't know the message
-                        * length initially.
-                        */
-                       int                     nread = 0;
-                       int                     targetlen = sizeof(PgStat_MsgHdr);              /* initial */
-                       bool            pipeEOF = false;
-
-                       while (nread < targetlen)
-                       {
-                               len = piperead(readPipe, ((char *) &msg) + nread,
-                                                               targetlen - nread);
-                               if (len < 0)
-                               {
-                                       if (errno == EINTR)
-                                               continue;
-                                       ereport(LOG,
-                                                       (errcode_for_socket_access(),
-                                                        errmsg("could not read from statistics collector pipe: %m")));
-                                       exit(1);
-                               }
-                               if (len == 0)   /* EOF on the pipe! */
-                               {
-                                       pipeEOF = true;
-                                       break;
-                               }
-                               nread += len;
-                               if (nread == sizeof(PgStat_MsgHdr))
-                               {
-                                       /* we have the header, compute actual msg length */
-                                       targetlen = msg.msg_hdr.m_size;
-                                       if (targetlen < (int) sizeof(PgStat_MsgHdr) ||
-                                               targetlen > (int) sizeof(msg))
-                                       {
-                                               /*
-                                                * Bogus message length implies that we got out of
-                                                * sync with the buffer process somehow. Abort so
-                                                * that we can restart both processes.
-                                                */
-                                               ereport(LOG,
-                                                 (errmsg("invalid statistics message length")));
-                                               exit(1);
-                                       }
-                               }
-                       }
+                       int                     save_changecount = beentry->st_changecount;
  
                         /*
-                        * EOF on the pipe implies that the buffer process exited.
-                        * Fall out of outer loop.
+                        * XXX if PGBE_ACTIVITY_SIZE is really large, it might be best to
+                        * use strcpy not memcpy for copying the activity string?
                          */
-                       if (pipeEOF)
+                       memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
+
+                       if (save_changecount == beentry->st_changecount &&
+                               (save_changecount & 1) == 0)
                                 break;
  
-                       /*
-                        * Distribute the message to the specific function handling
-                        * it.
-                        */
-                       switch (msg.msg_hdr.m_type)
-                       {
-                               case PGSTAT_MTYPE_DUMMY:
-                                       break;
+                       /* Make sure we can break out of loop if stuck... */
+                       CHECK_FOR_INTERRUPTS();
+               }
  
-                               case PGSTAT_MTYPE_BESTART:
-                                       pgstat_recv_bestart((PgStat_MsgBestart *) &msg, nread);
-                                       break;
+               beentry++;
+               /* Only valid entries get included into the local array */
+               if (localentry->st_procpid > 0)
+               {
+                       localentry++;
+                       localNumBackends++;
+               }
+       }
  
-                               case PGSTAT_MTYPE_BETERM:
-                                       pgstat_recv_beterm((PgStat_MsgBeterm *) &msg, nread);
-                                       break;
+       /* Set the pointer only after completion of a valid table */
+       localBackendStatusTable = localtable;
+}
  
-                               case PGSTAT_MTYPE_TABSTAT:
-                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, nread);
-                                       break;
  
-                               case PGSTAT_MTYPE_TABPURGE:
-                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, nread);
-                                       break;
+/* ------------------------------------------------------------
+ * Local support functions follow
+ * ------------------------------------------------------------
+ */
  
-                               case PGSTAT_MTYPE_ACTIVITY:
-                                       pgstat_recv_activity((PgStat_MsgActivity *) &msg, nread);
-                                       break;
  
-                               case PGSTAT_MTYPE_DROPDB:
-                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, nread);
-                                       break;
+/* ----------
+ * pgstat_setheader() -
+ *
+ *             Set common header fields in a statistics message
+ * ----------
+ */
+static void
+pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
+{
+       hdr->m_type = mtype;
+}
  
-                               case PGSTAT_MTYPE_RESETCOUNTER:
-                                       pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
-                                                                                        nread);
-                                       break;
  
-                               default:
-                                       break;
-                       }
+/* ----------
+ * pgstat_send() -
+ *
+ *             Send out one statistics message to the collector
+ * ----------
+ */
+static void
+pgstat_send(void *msg, int len)
+{
+       int                     rc;
  
-                       /*
-                        * Globally count messages.
-                        */
-                       pgStatNumMessages++;
+       if (pgStatSock < 0)
+               return;
  
-                       /*
-                        * If this is the first message after we wrote the stats file
-                        * the last time, setup the timeout that it'd be written.
-                        */
-                       if (!need_statwrite)
-                       {
-                               gettimeofday(&next_statwrite, NULL);
-                               next_statwrite.tv_usec += ((PGSTAT_STAT_INTERVAL) * 1000);
-                               next_statwrite.tv_sec += (next_statwrite.tv_usec / 1000000);
-                               next_statwrite.tv_usec %= 1000000;
-                               need_statwrite = TRUE;
-                       }
-               }
+       ((PgStat_MsgHdr *) msg)->m_size = len;
  
-               /*
-                * Note that we do NOT check for postmaster exit inside the loop;
-                * only EOF on the buffer pipe causes us to fall out.  This
-                * ensures we don't exit prematurely if there are still a few
-                * messages in the buffer or pipe at postmaster shutdown.
-                */
-       }
+       /* We'll retry after EINTR, but ignore all other failures */
+       do
+       {
+               rc = send(pgStatSock, msg, len, 0);
+       } while (rc < 0 && errno == EINTR);
+
+#ifdef USE_ASSERT_CHECKING
+       /* In debug builds, log send failures ... */
+       if (rc < 0)
+               elog(LOG, "could not send to statistics collector: %m");
+#endif
+}
+
+/* ----------
+ * pgstat_send_bgwriter() -
+ *
+ *      Send bgwriter statistics to the collector
+ * ----------
+ */
+void
+pgstat_send_bgwriter(void)
+{
+       /* We assume this initializes to zeroes */
+       static const PgStat_MsgBgWriter all_zeroes;
  
         /*
-        * Okay, we saw EOF on the buffer pipe, so there are no more messages
-        * to process.  If the buffer process quit because of postmaster
-        * shutdown, we want to save the final stats to reuse at next startup.
-        * But if the buffer process failed, it seems best not to (there may
-        * even now be a new collector firing up, and we don't want it to read
-        * a partially-rewritten stats file).
+        * This function can be called even if nothing at all has happened.
+        * In this case, avoid sending a completely empty message to
+        * the stats collector.
          */
-       if (!PostmasterIsAlive(false))
-               pgstat_write_statsfile();
+       if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+               return;
+
+       /*
+        * Prepare and send the message
+        */
+       pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+       pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+
+       /*
+        * Clear out the statistics buffer, so it can be re-used.
+        */
+       MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
  }
  
  
  /* ----------
- * pgstat_recvbuffer() -
+ * PgstatCollectorMain() -
+ *
+ *     Start up the statistics collector process.      This is the body of the
+ *     postmaster child process.
   *
- *     This is the body of the separate buffering process. Its only
- *     purpose is to receive messages from the UDP socket as fast as
- *     possible and forward them over a pipe into the collector itself.
- *     If the collector is slow to absorb messages, they are buffered here.
+ *     The argc/argv parameters are valid only in EXEC_BACKEND case.
   * ----------
   */
-static void
-pgstat_recvbuffer(void)
+NON_EXEC_STATIC void
+PgstatCollectorMain(int argc, char *argv[])
  {
-       fd_set          rfds;
-       fd_set          wfds;
-       struct timeval timeout;
-       int                     writePipe = pgStatPipe[1];
-       int                     maxfd;
-       int                     nready;
+       struct itimerval write_timeout;
+       bool            need_timer = false;
         int                     len;
-       int                     xfr;
-       int                     frm;
-       PgStat_Msg      input_buffer;
-       char       *msgbuffer;
-       int                     msg_send = 0;   /* next send index in buffer */
-       int                     msg_recv = 0;   /* next receive index */
-       int                     msg_have = 0;   /* number of bytes stored */
-       bool            overflow = false;
+       PgStat_Msg      msg;
+
+#ifndef WIN32
+#ifdef HAVE_POLL
+       struct pollfd input_fd;
+#else
+       struct timeval sel_timeout;
+       fd_set          rfds;
+#endif
+#endif
+
+       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
+
+       MyProcPid = getpid();           /* reset MyProcPid */
  
         /*
-        * Identify myself via ps
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.  (pgstat probably never has
+        * any child processes, but for consistency we make all postmaster
+        * child processes do this.)
          */
-       init_ps_display("stats buffer process", "", "");
-       set_ps_display("");
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
  
         /*
-        * We want to die if our child collector process does.  There are two
-        * ways we might notice that it has died: receive SIGCHLD, or get a
-        * write failure on the pipe leading to the child.      We can set SIGPIPE
-        * to kill us here.  Our SIGCHLD handler was already set up before we
-        * forked (must do it that way, else it's a race condition).
+        * Ignore all signals usually bound to some action in the postmaster,
+        * except SIGQUIT and SIGALRM.
          */
-       pqsignal(SIGPIPE, SIG_DFL);
+       pqsignal(SIGHUP, SIG_IGN);
+       pqsignal(SIGINT, SIG_IGN);
+       pqsignal(SIGTERM, SIG_IGN);
+       pqsignal(SIGQUIT, pgstat_exit);
+       pqsignal(SIGALRM, force_statwrite);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN);
+       pqsignal(SIGUSR2, SIG_IGN);
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
         PG_SETMASK(&UnBlockSig);
  
         /*
-        * Set the write pipe to nonblock mode, so that we cannot block when
-        * the collector falls behind.
+        * Identify myself via ps
          */
-       if (!set_noblock(writePipe))
-       {
-               ereport(LOG,
-                               (errcode_for_socket_access(),
-                 errmsg("could not set statistics collector pipe to nonblocking mode: %m")));
-               exit(1);
-       }
+       init_ps_display("stats collector process", "", "", "");
  
         /*
-        * Allocate the message buffer
+        * Arrange to write the initial status file right away
          */
-       msgbuffer = (char *) malloc(PGSTAT_RECVBUFFERSZ);
-       if (msgbuffer == NULL)
-       {
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-               exit(1);
-       }
+       need_statwrite = true;
+
+       /* Preset the delay between status file writes */
+       MemSet(&write_timeout, 0, sizeof(struct itimerval));
+       write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
+       write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
+
+       /*
+        * Read in an existing statistics stats file or initialize the stats to
+        * zero.
+        */
+       pgStatRunningInCollector = true;
+       pgStatDBHash = pgstat_read_statsfile(InvalidOid);
  
         /*
-        * Loop forever
+        * Setup the descriptor set for select(2).      Since only one bit in the set
+        * ever changes, we need not repeat FD_ZERO each time.
+        */
+#if !defined(HAVE_POLL) && !defined(WIN32)
+       FD_ZERO(&rfds);
+#endif
+
+       /*
+        * Loop to process messages until we get SIGQUIT or detect ungraceful
+        * death of our parent postmaster.
+        *
+        * For performance reasons, we don't want to do a PostmasterIsAlive() test
+        * after every message; instead, do it at statwrite time and if
+        * select()/poll() is interrupted by timeout.
          */
         for (;;)
         {
-               FD_ZERO(&rfds);
-               FD_ZERO(&wfds);
-               maxfd = -1;
+               int                     got_data;
  
                 /*
-                * As long as we have buffer space we add the socket to the read
-                * descriptor set.
+                * Quit if we get SIGQUIT from the postmaster.
                  */
-               if (msg_have <= (int) (PGSTAT_RECVBUFFERSZ - sizeof(PgStat_Msg)))
-               {
-                       FD_SET(pgStatSock, &rfds);
-                       maxfd = pgStatSock;
-                       overflow = false;
-               }
-               else
+               if (need_exit)
+                       break;
+
+               /*
+                * If time to write the stats file, do so.      Note that the alarm
+                * interrupt isn't re-enabled immediately, but only after we next
+                * receive a stats message; so no cycles are wasted when there is
+                * nothing going on.
+                */
+               if (need_statwrite)
                 {
-                       if (!overflow)
-                       {
-                               ereport(LOG,
-                                               (errmsg("statistics buffer is full")));
-                               overflow = true;
-                       }
+                       /* Check for postmaster death; if so we'll write file below */
+                       if (!PostmasterIsAlive(true))
+                               break;
+
+                       pgstat_write_statsfile();
+                       need_statwrite = false;
+                       need_timer = true;
                 }
  
                 /*
-                * If we have messages to write out, we add the pipe to the write
-                * descriptor set.
+                * Wait for a message to arrive; but not for more than
+                * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
+                * shut down after an ungraceful postmaster termination; so it needn't
+                * be very fast.  However, on some systems SIGQUIT won't interrupt the
+                * poll/select call, so this also limits speed of response to SIGQUIT,
+                * which is more important.)
+                *
+                * We use poll(2) if available, otherwise select(2).
+                * Win32 has its own implementation.
                  */
-               if (msg_have > 0)
+#ifndef WIN32
+#ifdef HAVE_POLL
+               input_fd.fd = pgStatSock;
+               input_fd.events = POLLIN | POLLERR;
+               input_fd.revents = 0;
+
+               if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
                 {
-                       FD_SET(writePipe, &wfds);
-                       if (writePipe > maxfd)
-                               maxfd = writePipe;
+                       if (errno == EINTR)
+                               continue;
+                       ereport(ERROR,
+                                       (errcode_for_socket_access(),
+                                        errmsg("poll() failed in statistics collector: %m")));
                 }
  
+               got_data = (input_fd.revents != 0);
+#else                                                  /* !HAVE_POLL */
+
+               FD_SET(pgStatSock, &rfds);
+
                 /*
-                * Wait for some work to do; but not for more than 10 seconds.
-                * (This determines how quickly we will shut down after an
-                * ungraceful postmaster termination; so it needn't be very fast.)
+                * timeout struct is modified by select() on some operating systems,
+                * so re-fill it each time.
                  */
-               timeout.tv_sec = 10;
-               timeout.tv_usec = 0;
+               sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
+               sel_timeout.tv_usec = 0;
  
-               nready = select(maxfd + 1, &rfds, &wfds, NULL, &timeout);
-               if (nready < 0)
+               if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
                 {
                         if (errno == EINTR)
                                 continue;
-                       ereport(LOG,
+                       ereport(ERROR,
                                         (errcode_for_socket_access(),
-                                        errmsg("select() failed in statistics buffer: %m")));
-                       exit(1);
+                                        errmsg("select() failed in statistics collector: %m")));
                 }
  
+               got_data = FD_ISSET(pgStatSock, &rfds);
+#endif   /* HAVE_POLL */
+#else /* WIN32 */
+               got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
+                                                                                          PGSTAT_SELECT_TIMEOUT*1000);
+#endif
+
                 /*
                  * If there is a message on the socket, read it and check for
                  * validity.
                  */
-               if (FD_ISSET(pgStatSock, &rfds))
+               if (got_data)
                 {
-                       len = recv(pgStatSock, (char *) &input_buffer,
+                       len = recv(pgStatSock, (char *) &msg,
                                            sizeof(PgStat_Msg), 0);
                         if (len < 0)
                         {
-                               ereport(LOG,
+                               if (errno == EINTR)
+                                       continue;
+                               ereport(ERROR,
                                                 (errcode_for_socket_access(),
-                                          errmsg("could not read statistics message: %m")));
-                               exit(1);
+                                                errmsg("could not read statistics message: %m")));
                         }
  
                         /*
@@ -1942,291 +2322,150 @@ pgstat_recvbuffer(void)
                         /*
                          * The received length must match the length in the header
                          */
-                       if (input_buffer.msg_hdr.m_size != len)
+                       if (msg.msg_hdr.m_size != len)
                                 continue;
  
                         /*
-                        * O.K. - we accept this message.  Copy it to the circular
-                        * msgbuffer.
+                        * O.K. - we accept this message.  Process it.
                          */
-                       frm = 0;
-                       while (len > 0)
+                       switch (msg.msg_hdr.m_type)
                         {
-                               xfr = PGSTAT_RECVBUFFERSZ - msg_recv;
-                               if (xfr > len)
-                                       xfr = len;
-                               Assert(xfr > 0);
-                               memcpy(msgbuffer + msg_recv,
-                                          ((char *) &input_buffer) + frm,
-                                          xfr);
-                               msg_recv += xfr;
-                               if (msg_recv == PGSTAT_RECVBUFFERSZ)
-                                       msg_recv = 0;
-                               msg_have += xfr;
-                               frm += xfr;
-                               len -= xfr;
+                               case PGSTAT_MTYPE_DUMMY:
+                                       break;
+
+                               case PGSTAT_MTYPE_TABSTAT:
+                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_TABPURGE:
+                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_DROPDB:
+                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_RESETCOUNTER:
+                                       pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
+                                                                                        len);
+                                       break;
+
+                               case PGSTAT_MTYPE_AUTOVAC_START:
+                                       pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_VACUUM:
+                                       pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_ANALYZE:
+                                       pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_BGWRITER:
+                                       pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+                                       break;
+
+                               default:
+                                       break;
                         }
-               }
  
-               /*
-                * If the collector is ready to receive, write some data into his
-                * pipe.  We may or may not be able to write all that we have.
-                *
-                * NOTE: if what we have is less than PIPE_BUF bytes but more than
-                * the space available in the pipe buffer, most kernels will
-                * refuse to write any of it, and will return EAGAIN.  This means
-                * we will busy-loop until the situation changes (either because
-                * the collector caught up, or because more data arrives so that
-                * we have more than PIPE_BUF bytes buffered).  This is not good,
-                * but is there any way around it?      We have no way to tell when
-                * the collector has caught up...
-                */
-               if (FD_ISSET(writePipe, &wfds))
-               {
-                       xfr = PGSTAT_RECVBUFFERSZ - msg_send;
-                       if (xfr > msg_have)
-                               xfr = msg_have;
-                       Assert(xfr > 0);
-                       len = pipewrite(writePipe, msgbuffer + msg_send, xfr);
-                       if (len < 0)
+                       /*
+                        * If this is the first message after we wrote the stats file the
+                        * last time, enable the alarm interrupt to make it be written
+                        * again later.
+                        */
+                       if (need_timer)
                         {
-                               if (errno == EINTR || errno == EAGAIN)
-                                       continue;       /* not enough space in pipe */
-                               ereport(LOG,
-                                               (errcode_for_socket_access(),
-                                                errmsg("could not write to statistics collector pipe: %m")));
-                               exit(1);
+                               if (setitimer(ITIMER_REAL, &write_timeout, NULL))
+                                       ereport(ERROR,
+                                       (errmsg("could not set statistics collector timer: %m")));
+                               need_timer = false;
                         }
-                       /* NB: len < xfr is okay */
-                       msg_send += len;
-                       if (msg_send == PGSTAT_RECVBUFFERSZ)
-                               msg_send = 0;
-                       msg_have -= len;
                 }
+               else
+               {
+                       /*
+                        * We can only get here if the select/poll timeout elapsed. Check
+                        * for postmaster death.
+                        */
+                       if (!PostmasterIsAlive(true))
+                               break;
+               }
+       }                                                       /* end of message-processing loop */
  
-               /*
-                * Make sure we forwarded all messages before we check for
-                * postmaster termination.
-                */
-               if (msg_have != 0 || FD_ISSET(pgStatSock, &rfds))
-                       continue;
+       /*
+        * Save the final stats to reuse at next startup.
+        */
+       pgstat_write_statsfile();
  
-               /*
-                * If the postmaster has terminated, we die too.  (This is no longer
-                * the normal exit path, however.)
-                */
-               if (!PostmasterIsAlive(true))
-                       exit(0);
-       }
+       exit(0);
  }
  
-/* SIGQUIT signal handler for buffer process */
+
+/* SIGQUIT signal handler for collector process */
  static void
  pgstat_exit(SIGNAL_ARGS)
  {
-       /*
-        * For now, we just nail the doors shut and get out of town.  It might
-        * be cleaner to allow any pending messages to be sent, but that creates
-        * a tradeoff against speed of exit.
-        */
-       exit(0);
+       need_exit = true;
  }
  
-/* SIGCHLD signal handler for buffer process */
+/* SIGALRM signal handler for collector process */
  static void
-pgstat_die(SIGNAL_ARGS)
+force_statwrite(SIGNAL_ARGS)
  {
-       exit(1);
+       need_statwrite = true;
  }
  
  
-/* ----------
- * pgstat_add_backend() -
- *
- *     Support function to keep our backend list up to date.
- * ----------
+/*
+ * Lookup the hash table entry for the specified database. If no hash
+ * table entry exists, initialize it, if the create parameter is true.
+ * Else, return NULL.
   */
-static int
-pgstat_add_backend(PgStat_MsgHdr *msg)
+static PgStat_StatDBEntry *
+pgstat_get_db_entry(Oid databaseid, bool create)
  {
-       PgStat_StatDBEntry *dbentry;
-       PgStat_StatBeEntry *beentry;
-       PgStat_StatBeDead *deadbe;
+       PgStat_StatDBEntry *result;
         bool            found;
+       HASHACTION      action = (create ? HASH_ENTER : HASH_FIND);
  
-       /*
-        * Check that the backend ID is valid
-        */
-       if (msg->m_backendid < 1 || msg->m_backendid > MaxBackends)
-       {
-               ereport(LOG,
-                               (errmsg("invalid server process ID %d", msg->m_backendid)));
-               return -1;
-       }
+       /* Lookup or create the hash table entry for this database */
+       result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+                                                                                               &databaseid,
+                                                                                               action, &found);
  
-       /*
-        * Get the slot for this backendid.
-        */
-       beentry = &pgStatBeTable[msg->m_backendid - 1];
-       if (beentry->databaseid != InvalidOid)
-       {
-               /*
-                * If the slot contains the PID of this backend, everything is
-                * fine and we got nothing to do.
-                */
-               if (beentry->procpid == msg->m_procpid)
-                       return 0;
-       }
-
-       /*
-        * Lookup if this backend is known to be dead. This can be caused due
-        * to messages arriving in the wrong order - i.e. Postmaster's BETERM
-        * message might have arrived before we received all the backends
-        * stats messages, or even a new backend with the same backendid was
-        * faster in sending his BESTART.
-        *
-        * If the backend is known to be dead, we ignore this add.
-        */
-       deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
-                                                                                          (void *) &(msg->m_procpid),
-                                                                                          HASH_FIND, NULL);
-       if (deadbe)
-               return 1;
-
-       /*
-        * Backend isn't known to be dead. If it's slot is currently used, we
-        * have to kick out the old backend.
-        */
-       if (beentry->databaseid != InvalidOid)
-               pgstat_sub_backend(beentry->procpid);
-
-       /*
-        * Put this new backend into the slot.
-        */
-       beentry->databaseid = msg->m_databaseid;
-       beentry->procpid = msg->m_procpid;
-       beentry->userid = msg->m_userid;
-       beentry->activity_start_sec = 0;
-       beentry->activity_start_usec = 0;
-       MemSet(beentry->activity, 0, PGSTAT_ACTIVITY_SIZE);
-
-       /*
-        * Lookup or create the database entry for this backends DB.
-        */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                  (void *) &(msg->m_databaseid),
-                                                                                                HASH_ENTER, &found);
-       if (dbentry == NULL)
-       {
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-               exit(1);
-       }
+       if (!create && !found)
+               return NULL;
  
-       /*
-        * If not found, initialize the new one.
-        */
+       /* If not found, initialize the new one. */
         if (!found)
         {
                 HASHCTL         hash_ctl;
  
-               dbentry->tables = NULL;
-               dbentry->n_xact_commit = 0;
-               dbentry->n_xact_rollback = 0;
-               dbentry->n_blocks_fetched = 0;
-               dbentry->n_blocks_hit = 0;
-               dbentry->n_connects = 0;
-               dbentry->destroy = 0;
+               result->tables = NULL;
+               result->n_xact_commit = 0;
+               result->n_xact_rollback = 0;
+               result->n_blocks_fetched = 0;
+               result->n_blocks_hit = 0;
+               result->n_tuples_returned = 0;
+               result->n_tuples_fetched = 0;
+               result->n_tuples_inserted = 0;
+               result->n_tuples_updated = 0;
+               result->n_tuples_deleted = 0;
+               result->last_autovac_time = 0;
  
                 memset(&hash_ctl, 0, sizeof(hash_ctl));
                 hash_ctl.keysize = sizeof(Oid);
                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-               hash_ctl.hash = tag_hash;
-               dbentry->tables = hash_create("Per-database table",
-                                                                         PGSTAT_TAB_HASH_SIZE,
-                                                                         &hash_ctl,
-                                                                         HASH_ELEM | HASH_FUNCTION);
-               if (dbentry->tables == NULL)
-               {
-                       /* assume the problem is out-of-memory */
-                       ereport(LOG,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-                       exit(1);
-               }
-       }
-
-       /*
-        * Count number of connects to the database
-        */
-       dbentry->n_connects++;
-
-       return 0;
-}
-
-
-/* ----------
- * pgstat_sub_backend() -
- *
- *     Remove a backend from the actual backends list.
- * ----------
- */
-static void
-pgstat_sub_backend(int procpid)
-{
-       int                     i;
-       PgStat_StatBeDead *deadbe;
-       bool            found;
-
-       /*
-        * Search in the known-backends table for the slot containing this
-        * PID.
-        */
-       for (i = 0; i < MaxBackends; i++)
-       {
-               if (pgStatBeTable[i].databaseid != InvalidOid &&
-                       pgStatBeTable[i].procpid == procpid)
-               {
-                       /*
-                        * That's him. Add an entry to the known to be dead backends.
-                        * Due to possible misorder in the arrival of UDP packets it's
-                        * possible that even if we know the backend is dead, there
-                        * could still be messages queued that arrive later. Those
-                        * messages must not cause our number of backends statistics
-                        * to get screwed up, so we remember for a couple of seconds
-                        * that this PID is dead and ignore them (only the counting of
-                        * backends, not the table access stats they sent).
-                        */
-                       deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
-                                                                                                          (void *) &procpid,
-                                                                                                          HASH_ENTER,
-                                                                                                          &found);
-                       if (deadbe == NULL)
-                       {
-                               ereport(LOG,
-                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                errmsg("out of memory in statistics collector --- abort")));
-                               exit(1);
-                       }
-                       if (!found)
-                       {
-                               deadbe->backendid = i + 1;
-                               deadbe->destroy = PGSTAT_DESTROY_COUNT;
-                       }
-
-                       /*
-                        * Declare the backend slot empty.
-                        */
-                       pgStatBeTable[i].databaseid = InvalidOid;
-                       return;
-               }
+               hash_ctl.hash = oid_hash;
+               result->tables = hash_create("Per-database table",
+                                                                        PGSTAT_TAB_HASH_SIZE,
+                                                                        &hash_ctl,
+                                                                        HASH_ELEM | HASH_FUNCTION);
         }
  
-       /*
-        * No big problem if not found. This can happen if UDP messages arrive
-        * out of order here.
-        */
+       return result;
  }
  
  
@@ -2243,23 +2482,33 @@ pgstat_write_statsfile(void)
         HASH_SEQ_STATUS tstat;
         PgStat_StatDBEntry *dbentry;
         PgStat_StatTabEntry *tabentry;
-       PgStat_StatBeDead *deadbe;
         FILE       *fpout;
-       int                     i;
+       int32           format_id;
  
         /*
          * Open the statistics temp file to write out the current values.
          */
-       fpout = fopen(pgStat_tmpfname, PG_BINARY_W);
+       fpout = fopen(PGSTAT_STAT_TMPFILE, PG_BINARY_W);
         if (fpout == NULL)
         {
                 ereport(LOG,
                                 (errcode_for_file_access(),
                                  errmsg("could not open temporary statistics file \"%s\": %m",
-                                               pgStat_tmpfname)));
+                                               PGSTAT_STAT_TMPFILE)));
                 return;
         }
  
+       /*
+        * Write the file header --- currently just a format ID.
+        */
+       format_id = PGSTAT_FILE_FORMAT_ID;
+       fwrite(&format_id, sizeof(format_id), 1, fpout);
+
+       /*
+        * Write global stats struct
+        */
+       fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
         /*
          * Walk through the database table.
          */
@@ -2267,72 +2516,19 @@ pgstat_write_statsfile(void)
         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
         {
                 /*
-                * If this database is marked destroyed, count down and do so if
-                * it reaches 0.
-                */
-               if (dbentry->destroy > 0)
-               {
-                       if (--(dbentry->destroy) == 0)
-                       {
-                               if (dbentry->tables != NULL)
-                                       hash_destroy(dbentry->tables);
-
-                               if (hash_search(pgStatDBHash,
-                                                               (void *) &(dbentry->databaseid),
-                                                               HASH_REMOVE, NULL) == NULL)
-                               {
-                                       ereport(LOG,
-                                                       (errmsg("database hash table corrupted "
-                                                                       "during cleanup --- abort")));
-                                       exit(1);
-                               }
-                       }
-
-                       /*
-                        * Don't include statistics for it.
-                        */
-                       continue;
-               }
-
-               /*
-                * Write out the DB line including the number of life backends.
+                * Write out the DB entry including the number of live backends. We
+                * don't write the tables pointer since it's of no use to any other
+                * process.
                  */
                 fputc('D', fpout);
-               fwrite(dbentry, sizeof(PgStat_StatDBEntry), 1, fpout);
+               fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
  
                 /*
-                * Walk through the databases access stats per table.
+                * Walk through the database's access stats per table.
                  */
                 hash_seq_init(&tstat, dbentry->tables);
                 while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
                 {
-                       /*
-                        * If table entry marked for destruction, same as above for
-                        * the database entry.
-                        */
-                       if (tabentry->destroy > 0)
-                       {
-                               if (--(tabentry->destroy) == 0)
-                               {
-                                       if (hash_search(dbentry->tables,
-                                                                       (void *) &(tabentry->tableid),
-                                                                       HASH_REMOVE, NULL) == NULL)
-                                       {
-                                               ereport(LOG,
-                                                               (errmsg("tables hash table for "
-                                                                               "database %u corrupted during "
-                                                                               "cleanup --- abort",
-                                                                               dbentry->databaseid)));
-                                               exit(1);
-                                       }
-                               }
-                               continue;
-                       }
-
-                       /*
-                        * At least we think this is still a life table. Print it's
-                        * access stats.
-                        */
                         fputc('T', fpout);
                         fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
                 }
@@ -2343,67 +2539,37 @@ pgstat_write_statsfile(void)
                 fputc('d', fpout);
         }
  
-       /*
-        * Write out the known running backends to the stats file.
-        */
-       i = MaxBackends;
-       fputc('M', fpout);
-       fwrite(&i, sizeof(i), 1, fpout);
-
-       for (i = 0; i < MaxBackends; i++)
-       {
-               if (pgStatBeTable[i].databaseid != InvalidOid)
-               {
-                       fputc('B', fpout);
-                       fwrite(&pgStatBeTable[i], sizeof(PgStat_StatBeEntry), 1, fpout);
-               }
-       }
-
         /*
          * No more output to be done. Close the temp file and replace the old
-        * pgstat.stat with it.
+        * pgstat.stat with it.  The ferror() check replaces testing for error
+        * after each individual fputc or fwrite above.
          */
         fputc('E', fpout);
-       if (fclose(fpout) < 0)
+
+       if (ferror(fpout))
         {
                 ereport(LOG,
                                 (errcode_for_file_access(),
-                                errmsg("could not close temporary statistics file \"%s\": %m",
-                                               pgStat_tmpfname)));
+                          errmsg("could not write temporary statistics file \"%s\": %m",
+                                         PGSTAT_STAT_TMPFILE)));
+               fclose(fpout);
+               unlink(PGSTAT_STAT_TMPFILE);
         }
-       else
+       else if (fclose(fpout) < 0)
         {
-               if (rename(pgStat_tmpfname, pgStat_fname) < 0)
-               {
-                       ereport(LOG,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
-                                                       pgStat_tmpfname, pgStat_fname)));
-               }
+               ereport(LOG,
+                               (errcode_for_file_access(),
+                          errmsg("could not close temporary statistics file \"%s\": %m",
+                                         PGSTAT_STAT_TMPFILE)));
+               unlink(PGSTAT_STAT_TMPFILE);
         }
-
-       /*
-        * Clear out the dead backends table
-        */
-       hash_seq_init(&hstat, pgStatBeDead);
-       while ((deadbe = (PgStat_StatBeDead *) hash_seq_search(&hstat)) != NULL)
+       else if (rename(PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME) < 0)
         {
-               /*
-                * Count down the destroy delay and remove entries where it
-                * reaches 0.
-                */
-               if (--(deadbe->destroy) <= 0)
-               {
-                       if (hash_search(pgStatBeDead,
-                                                       (void *) &(deadbe->procpid),
-                                                       HASH_REMOVE, NULL) == NULL)
-                       {
-                               ereport(LOG,
-                                               (errmsg("dead-server-process hash table corrupted "
-                                                               "during cleanup --- abort")));
-                               exit(1);
-                       }
-               }
+               ereport(LOG,
+                               (errcode_for_file_access(),
+                                errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+                                               PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME)));
+               unlink(PGSTAT_STAT_TMPFILE);
         }
  }
  
@@ -2411,44 +2577,28 @@ pgstat_write_statsfile(void)
  /* ----------
   * pgstat_read_statsfile() -
   *
- *     Reads in an existing statistics collector and initializes the
- *     databases hash table (who's entries point to the tables hash tables)
- *     and the current backend table.
+ *     Reads in an existing statistics collector file and initializes the
+ *     databases' hash table (whose entries point to the tables' hash tables).
   * ----------
   */
-static void
-pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-                                         PgStat_StatBeEntry **betab, int *numbackends)
+static HTAB *
+pgstat_read_statsfile(Oid onlydb)
  {
         PgStat_StatDBEntry *dbentry;
         PgStat_StatDBEntry dbbuf;
         PgStat_StatTabEntry *tabentry;
         PgStat_StatTabEntry tabbuf;
         HASHCTL         hash_ctl;
+       HTAB       *dbhash;
         HTAB       *tabhash = NULL;
         FILE       *fpin;
-       int                     maxbackends = 0;
-       int                     havebackends = 0;
+       int32           format_id;
         bool            found;
-       MemoryContext use_mcxt;
-       int                     mcxt_flags;
  
         /*
-        * If running in the collector we use the DynaHashCxt memory context.
-        * If running in a backend, we use the TopTransactionContext instead,
-        * so the caller must only know the last XactId when this call
-        * happened to know if his tables are still valid or already gone!
+        * The tables will live in pgStatLocalContext.
          */
-       if (pgStatRunningInCollector)
-       {
-               use_mcxt = NULL;
-               mcxt_flags = 0;
-       }
-       else
-       {
-               use_mcxt = TopTransactionContext;
-               mcxt_flags = HASH_CONTEXT;
-       }
+       pgstat_setup_memcxt();
  
         /*
          * Create the DB hashtable
@@ -2456,54 +2606,45 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
         memset(&hash_ctl, 0, sizeof(hash_ctl));
         hash_ctl.keysize = sizeof(Oid);
         hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
-       hash_ctl.hash = tag_hash;
-       hash_ctl.hcxt = use_mcxt;
-       *dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-                                                 HASH_ELEM | HASH_FUNCTION | mcxt_flags);
-       if (*dbhash == NULL)
-       {
-               /* assume the problem is out-of-memory */
-               if (pgStatRunningInCollector)
-               {
-                       ereport(LOG,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-                       exit(1);
-               }
-               /* in backend, can do normal error */
-               ereport(ERROR,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                errmsg("out of memory")));
-       }
+       hash_ctl.hash = oid_hash;
+       hash_ctl.hcxt = pgStatLocalContext;
+       dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
+                                                HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
  
         /*
-        * Initialize the number of known backends to zero, just in case we do
-        * a silent error return below.
+        * Clear out global statistics so they start from zero in case we can't
+        * load an existing statsfile.
          */
-       if (numbackends != NULL)
-               *numbackends = 0;
-       if (betab != NULL)
-               *betab = NULL;
+       memset(&globalStats, 0, sizeof(globalStats));
  
         /*
-        * In EXEC_BACKEND case, we won't have inherited pgStat_fname from
-        * postmaster, so compute it first time through.
+        * Try to open the status file. If it doesn't exist, the backends simply
+        * return zero for anything and the collector simply starts from scratch
+        * with empty counters.
          */
-#ifdef EXEC_BACKEND
-       if (pgStat_fname[0] == '\0')
+       if ((fpin = AllocateFile(PGSTAT_STAT_FILENAME, PG_BINARY_R)) == NULL)
+               return dbhash;
+
+       /*
+        * Verify it's of the expected format.
+        */
+       if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
+               || format_id != PGSTAT_FILE_FORMAT_ID)
         {
-               Assert(DataDir != NULL);
-               snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted pgstat.stat file")));
+               goto done;
         }
-#endif
  
         /*
-        * Try to open the status file. If it doesn't exist, the backends
-        * simply return zero for anything and the collector simply starts
-        * from scratch with empty counters.
+        * Read global stats struct
          */
-       if ((fpin = fopen(pgStat_fname, PG_BINARY_R)) == NULL)
-               return;
+       if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+       {
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted pgstat.stat file")));
+               goto done;
+       }
  
         /*
          * We found an existing collector stats file. Read it and put all the
@@ -2515,90 +2656,59 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                 {
                                 /*
                                  * 'D'  A PgStat_StatDBEntry struct describing a database
-                                * follows. Subsequently, zero to many 'T' entries will
-                                * follow until a 'd' is encountered.
+                                * follows. Subsequently, zero to many 'T' entries will follow
+                                * until a 'd' is encountered.
                                  */
                         case 'D':
-                               if (fread(&dbbuf, 1, sizeof(dbbuf), fpin) != sizeof(dbbuf))
+                               if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
+                                                 fpin) != offsetof(PgStat_StatDBEntry, tables))
                                 {
                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
                                                         (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
+                                       goto done;
                                 }
  
                                 /*
                                  * Add to the DB hash
                                  */
-                               dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
-                                                                                         (void *) &dbbuf.databaseid,
+                               dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
+                                                                                                 (void *) &dbbuf.databaseid,
                                                                                                                          HASH_ENTER,
                                                                                                                          &found);
-                               if (dbentry == NULL)
-                               {
-                                       if (pgStatRunningInCollector)
-                                       {
-                                               ereport(LOG,
-                                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                                errmsg("out of memory in statistics collector --- abort")));
-                                               exit(1);
-                                       }
-                                       else
-                                       {
-                                               fclose(fpin);
-                                               ereport(ERROR,
-                                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                                errmsg("out of memory")));
-                                       }
-                               }
                                 if (found)
                                 {
                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
                                                         (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
+                                       goto done;
                                 }
  
                                 memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
                                 dbentry->tables = NULL;
-                               dbentry->destroy = 0;
-                               dbentry->n_backends = 0;
  
                                 /*
-                                * Don't collect tables if not the requested DB
+                                * Don't collect tables if not the requested DB (or the
+                                * shared-table info)
                                  */
-                               if (onlydb != InvalidOid && onlydb != dbbuf.databaseid)
-                                       break;
+                               if (onlydb != InvalidOid)
+                               {
+                                       if (dbbuf.databaseid != onlydb &&
+                                               dbbuf.databaseid != InvalidOid)
+                                               break;
+                               }
  
                                 memset(&hash_ctl, 0, sizeof(hash_ctl));
                                 hash_ctl.keysize = sizeof(Oid);
                                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-                               hash_ctl.hash = tag_hash;
-                               hash_ctl.hcxt = use_mcxt;
+                               hash_ctl.hash = oid_hash;
+                               hash_ctl.hcxt = pgStatLocalContext;
                                 dbentry->tables = hash_create("Per-database table",
                                                                                           PGSTAT_TAB_HASH_SIZE,
                                                                                           &hash_ctl,
-                                                                HASH_ELEM | HASH_FUNCTION | mcxt_flags);
-                               if (dbentry->tables == NULL)
-                               {
-                                       /* assume the problem is out-of-memory */
-                                       if (pgStatRunningInCollector)
-                                       {
-                                               ereport(LOG,
-                                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                                errmsg("out of memory in statistics collector --- abort")));
-                                               exit(1);
-                                       }
-                                       /* in backend, can do normal error */
-                                       fclose(fpin);
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                        errmsg("out of memory")));
-                               }
+                                                                        HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
  
                                 /*
-                                * Arrange that following 'T's add entries to this
-                                * databases tables hash table.
+                                * Arrange that following 'T's add entries to this database's
+                                * tables hash table.
                                  */
                                 tabhash = dbentry->tables;
                                 break;
@@ -2614,12 +2724,12 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                                  * 'T'  A PgStat_StatTabEntry follows.
                                  */
                         case 'T':
-                               if (fread(&tabbuf, 1, sizeof(tabbuf), fpin) != sizeof(tabbuf))
+                               if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
+                                                 fpin) != sizeof(PgStat_StatTabEntry))
                                 {
                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
                                                         (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
+                                       goto done;
                                 }
  
                                 /*
@@ -2629,188 +2739,99 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                                         break;
  
                                 tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-                                                                                               (void *) &tabbuf.tableid,
-                                                                                                        HASH_ENTER, &found);
-                               if (tabentry == NULL)
-                               {
-                                       if (pgStatRunningInCollector)
-                                       {
-                                               ereport(LOG,
-                                                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                                errmsg("out of memory in statistics collector --- abort")));
-                                               exit(1);
-                                       }
-                                       /* in backend, can do normal error */
-                                       fclose(fpin);
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                                        errmsg("out of memory")));
-                               }
+                                                                                                       (void *) &tabbuf.tableid,
+                                                                                                                HASH_ENTER, &found);
  
                                 if (found)
                                 {
                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
                                                         (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
+                                       goto done;
                                 }
  
                                 memcpy(tabentry, &tabbuf, sizeof(tabbuf));
                                 break;
  
-                               /*
-                                * 'M'  The maximum number of backends to expect follows.
-                                */
-                       case 'M':
-                               if (betab == NULL || numbackends == NULL)
-                               {
-                                       fclose(fpin);
-                                       return;
-                               }
-                               if (fread(&maxbackends, 1, sizeof(maxbackends), fpin) !=
-                                       sizeof(maxbackends))
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
-                               }
-                               if (maxbackends == 0)
-                               {
-                                       fclose(fpin);
-                                       return;
-                               }
-
-                               /*
-                                * Allocate space (in TopTransactionContext too) for the
-                                * backend table.
-                                */
-                               if (use_mcxt == NULL)
-                                       *betab = (PgStat_StatBeEntry *) malloc(
-                                                          sizeof(PgStat_StatBeEntry) * maxbackends);
-                               else
-                                       *betab = (PgStat_StatBeEntry *) MemoryContextAlloc(
-                                                                                                                               use_mcxt,
-                                                          sizeof(PgStat_StatBeEntry) * maxbackends);
-                               break;
-
-                               /*
-                                * 'B'  A PgStat_StatBeEntry follows.
-                                */
-                       case 'B':
-                               if (betab == NULL || numbackends == NULL)
-                               {
-                                       fclose(fpin);
-                                       return;
-                               }
-                               if (*betab == NULL)
-                               {
-                                       fclose(fpin);
-                                       return;
-                               }
-
-                               /*
-                                * Read it directly into the table.
-                                */
-                               if (fread(&(*betab)[havebackends], 1,
-                                                 sizeof(PgStat_StatBeEntry), fpin) !=
-                                       sizeof(PgStat_StatBeEntry))
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       fclose(fpin);
-                                       return;
-                               }
-
-                               /*
-                                * Count backends per database here.
-                                */
-                               dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
-                                                  (void *) &((*betab)[havebackends].databaseid),
-                                                                                                               HASH_FIND, NULL);
-                               if (dbentry)
-                                       dbentry->n_backends++;
-
-                               havebackends++;
-                               if (numbackends != 0)
-                                       *numbackends = havebackends;
-                               if (havebackends >= maxbackends)
-                               {
-                                       fclose(fpin);
-                                       return;
-                               }
-                               break;
-
                                 /*
                                  * 'E'  The EOF marker of a complete stats file.
                                  */
                         case 'E':
-                               fclose(fpin);
-                               return;
+                               goto done;
  
                         default:
                                 ereport(pgStatRunningInCollector ? LOG : WARNING,
                                                 (errmsg("corrupted pgstat.stat file")));
-                               fclose(fpin);
-                               return;
+                               goto done;
                 }
         }
  
-       fclose(fpin);
-}
+done:
+       FreeFile(fpin);
  
+       return dbhash;
+}
  
-/* ----------
- * pgstat_recv_bestart() -
- *
- *     Process a backend starup message.
- * ----------
+/*
+ * If not already done, read the statistics collector stats file into
+ * some hash tables.  The results will be kept until pgstat_clear_snapshot()
+ * is called (typically, at end of transaction).
   */
  static void
-pgstat_recv_bestart(PgStat_MsgBestart *msg, int len)
+backend_read_statsfile(void)
  {
-       pgstat_add_backend(&msg->m_hdr);
+       /* already read it? */
+       if (pgStatDBHash)
+               return;
+       Assert(!pgStatRunningInCollector);
+
+       /* Autovacuum launcher wants stats about all databases */
+       if (IsAutoVacuumLauncherProcess())
+               pgStatDBHash = pgstat_read_statsfile(InvalidOid);
+       else
+               pgStatDBHash = pgstat_read_statsfile(MyDatabaseId);
  }
  
  
  /* ----------
- * pgstat_recv_beterm() -
+ * pgstat_setup_memcxt() -
   *
- *     Process a backend termination message.
+ *     Create pgStatLocalContext, if not already done.
   * ----------
   */
  static void
-pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len)
+pgstat_setup_memcxt(void)
  {
-       pgstat_sub_backend(msg->m_hdr.m_procpid);
+       if (!pgStatLocalContext)
+               pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
+                                                                                                  "Statistics snapshot",
+                                                                                                  ALLOCSET_SMALL_MINSIZE,
+                                                                                                  ALLOCSET_SMALL_INITSIZE,
+                                                                                                  ALLOCSET_SMALL_MAXSIZE);
  }
  
  
  /* ----------
- * pgstat_recv_activity() -
+ * pgstat_clear_snapshot() -
   *
- *     Remember what the backend is doing.
+ *     Discard any data collected in the current transaction.  Any subsequent
+ *     request will cause new snapshots to be read.
+ *
+ *     This is also invoked during transaction commit or abort to discard
+ *     the no-longer-wanted snapshot.
   * ----------
   */
-static void
-pgstat_recv_activity(PgStat_MsgActivity *msg, int len)
+void
+pgstat_clear_snapshot(void)
  {
-       PgStat_StatBeEntry *entry;
-
-       /*
-        * Here we check explicitly for 0 return, since we don't want to
-        * mangle the activity of an active backend by a delayed packet from a
-        * dead one.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) != 0)
-               return;
-
-       entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
-
-       StrNCpy(entry->activity, msg->m_what, PGSTAT_ACTIVITY_SIZE);
-
-       entry->activity_start_sec =
-               GetCurrentAbsoluteTimeUsec(&entry->activity_start_usec);
+       /* Release memory, if any was allocated */
+       if (pgStatLocalContext)
+               MemoryContextDelete(pgStatLocalContext);
+
+       /* Reset variables */
+       pgStatLocalContext = NULL;
+       pgStatDBHash = NULL;
+       localBackendStatusTable = NULL;
+       localNumBackends = 0;
  }
  
  
@@ -2829,28 +2850,11 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
         int                     i;
         bool            found;
  
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
-       /*
-        * Lookup the database in the hashtable.
-        */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                        (void *) &(msg->m_hdr.m_databaseid),
-                                                                                                HASH_FIND, NULL);
-       if (!dbentry)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
  
         /*
-        * If the database is marked for destroy, this is a delayed UDP packet
-        * and not worth being counted.
+        * Update database-wide stats.
          */
-       if (dbentry->destroy > 0)
-               return;
-
         dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
         dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
  
@@ -2860,53 +2864,59 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
         for (i = 0; i < msg->m_nentries; i++)
         {
                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                                                         (void *) &(tabmsg[i].t_id),
-                                                                                                        HASH_ENTER, &found);
-               if (tabentry == NULL)
-               {
-                       ereport(LOG,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-                       exit(1);
-               }
+                                                                                                 (void *) &(tabmsg[i].t_id),
+                                                                                                          HASH_ENTER, &found);
  
                 if (!found)
                 {
                         /*
-                        * If it's a new table entry, initialize counters to the
-                        * values we just got.
+                        * If it's a new table entry, initialize counters to the values we
+                        * just got.
                          */
-                       tabentry->numscans = tabmsg[i].t_numscans;
-                       tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-                       tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
-
-                       tabentry->destroy = 0;
+                       tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
+
+                       tabentry->last_anl_tuples = 0;
+                       tabentry->vacuum_timestamp = 0;
+                       tabentry->autovac_vacuum_timestamp = 0;
+                       tabentry->analyze_timestamp = 0;
+                       tabentry->autovac_analyze_timestamp = 0;
                 }
                 else
                 {
                         /*
                          * Otherwise add the values to the existing entry.
                          */
-                       tabentry->numscans += tabmsg[i].t_numscans;
-                       tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-                       tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+                       tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
                 }
  
                 /*
-                * And add the block IO to the database entry.
+                * Add per-table stats to the per-database entry, too.
                  */
-               dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-               dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+               dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+               dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+               dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+               dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+               dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+               dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+               dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
         }
  }
  
@@ -2921,29 +2931,14 @@ static void
  pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
  {
         PgStat_StatDBEntry *dbentry;
-       PgStat_StatTabEntry *tabentry;
         int                     i;
  
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
-       /*
-        * Lookup the database in the hashtable.
-        */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                        (void *) &(msg->m_hdr.m_databaseid),
-                                                                                                HASH_FIND, NULL);
-       if (!dbentry)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
  
         /*
-        * If the database is marked for destroy, this is a delayed UDP packet
-        * and the tables will go away at DB destruction.
+        * No need to purge if we don't even know the database.
          */
-       if (dbentry->destroy > 0)
+       if (!dbentry || !dbentry->tables)
                 return;
  
         /*
@@ -2951,11 +2946,10 @@ pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
          */
         for (i = 0; i < msg->m_nentries; i++)
         {
-               tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                                                  (void *) &(msg->m_tableid[i]),
-                                                                                                          HASH_FIND, NULL);
-               if (tabentry)
-                       tabentry->destroy = PGSTAT_DESTROY_COUNT;
+               /* Remove from hashtable if present; we don't care if it's not. */
+               (void) hash_search(dbentry->tables,
+                                                  (void *) &(msg->m_tableid[i]),
+                                                  HASH_REMOVE, NULL);
         }
  }
  
@@ -2971,32 +2965,33 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
  {
         PgStat_StatDBEntry *dbentry;
  
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
         /*
          * Lookup the database in the hashtable.
          */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                  (void *) &(msg->m_databaseid),
-                                                                                                HASH_FIND, NULL);
-       if (!dbentry)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
  
         /*
-        * Mark the database for destruction.
+        * If found, remove it.
          */
-       dbentry->destroy = PGSTAT_DESTROY_COUNT;
+       if (dbentry)
+       {
+               if (dbentry->tables != NULL)
+                       hash_destroy(dbentry->tables);
+
+               if (hash_search(pgStatDBHash,
+                                               (void *) &(dbentry->databaseid),
+                                               HASH_REMOVE, NULL) == NULL)
+                       ereport(ERROR,
+                                       (errmsg("database hash table corrupted "
+                                                       "during cleanup --- abort")));
+       }
  }
  
  
  /* ----------
- * pgstat_recv_dropdb() -
+ * pgstat_recv_resetcounter() -
   *
- *     Arrange for dead database removal
+ *     Reset the statistics for the specified database.
   * ----------
   */
  static void
@@ -3006,23 +3001,16 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
         PgStat_StatDBEntry *dbentry;
  
         /*
-        * Make sure the backend is counted for.
+        * Lookup the database in the hashtable.  Nothing to do if not there.
          */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
  
-       /*
-        * Lookup the database in the hashtable.
-        */
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                        (void *) &(msg->m_hdr.m_databaseid),
-                                                                                                HASH_FIND, NULL);
         if (!dbentry)
                 return;
  
         /*
-        * We simply throw away all the databases table entries by recreating
-        * a new hash table for them.
+        * We simply throw away all the database's table entries by recreating a
+        * new hash table for them.
          */
         if (dbentry->tables != NULL)
                 hash_destroy(dbentry->tables);
@@ -3032,23 +3020,143 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
         dbentry->n_xact_rollback = 0;
         dbentry->n_blocks_fetched = 0;
         dbentry->n_blocks_hit = 0;
-       dbentry->n_connects = 0;
-       dbentry->destroy = 0;
  
         memset(&hash_ctl, 0, sizeof(hash_ctl));
         hash_ctl.keysize = sizeof(Oid);
         hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-       hash_ctl.hash = tag_hash;
+       hash_ctl.hash = oid_hash;
         dbentry->tables = hash_create("Per-database table",
                                                                   PGSTAT_TAB_HASH_SIZE,
                                                                   &hash_ctl,
                                                                   HASH_ELEM | HASH_FUNCTION);
-       if (dbentry->tables == NULL)
+}
+
+/* ----------
+ * pgstat_recv_autovac() -
+ *
+ *     Process an autovacuum signalling message.
+ * ----------
+ */
+static void
+pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+
+       /*
+        * Lookup the database in the hashtable.  Don't create the entry if it
+        * doesn't exist, because autovacuum may be processing a template
+        * database.  If this isn't the case, the database is most likely to have
+        * an entry already.  (If it doesn't, not much harm is done anyway --
+        * it'll get created as soon as somebody actually uses the database.)
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;
+
+       /*
+        * Store the last autovacuum time in the database entry.
+        */
+       dbentry->last_autovac_time = msg->m_start_time;
+}
+
+/* ----------
+ * pgstat_recv_vacuum() -
+ *
+ *     Process a VACUUM message.
+ * ----------
+ */
+static void
+pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatTabEntry *tabentry;
+
+       /*
+        * Don't create either the database or table entry if it doesn't already
+        * exist.  This avoids bloating the stats with entries for stuff that is
+        * only touched by vacuum and not by live operations.
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;
+
+       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+                                                  HASH_FIND, NULL);
+       if (tabentry == NULL)
+               return;
+
+       if (msg->m_autovacuum)
+               tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
+       else
+               tabentry->vacuum_timestamp = msg->m_vacuumtime;
+       tabentry->n_live_tuples = msg->m_tuples;
+       tabentry->n_dead_tuples = 0;
+       if (msg->m_analyze)
         {
-               /* assume the problem is out-of-memory */
-               ereport(LOG,
-                               (errcode(ERRCODE_OUT_OF_MEMORY),
-                        errmsg("out of memory in statistics collector --- abort")));
-               exit(1);
+               tabentry->last_anl_tuples = msg->m_tuples;
+               if (msg->m_autovacuum)
+                       tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
+               else
+                       tabentry->analyze_timestamp = msg->m_vacuumtime;
         }
+       else
+       {
+               /* last_anl_tuples must never exceed n_live_tuples */
+               tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples,
+                                                                               msg->m_tuples);
+       }
+}
+
+/* ----------
+ * pgstat_recv_analyze() -
+ *
+ *     Process an ANALYZE message.
+ * ----------
+ */
+static void
+pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatTabEntry *tabentry;
+
+       /*
+        * Don't create either the database or table entry if it doesn't already
+        * exist.  This avoids bloating the stats with entries for stuff that is
+        * only touched by analyze and not by live operations.
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;
+
+       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+                                                  HASH_FIND, NULL);
+       if (tabentry == NULL)
+               return;
+
+       if (msg->m_autovacuum)
+               tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
+       else
+               tabentry->analyze_timestamp = msg->m_analyzetime;
+       tabentry->n_live_tuples = msg->m_live_tuples;
+       tabentry->n_dead_tuples = msg->m_dead_tuples;
+       tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
+}
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ *     Process a BGWRITER message.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+       globalStats.timed_checkpoints += msg->m_timed_checkpoints;
+       globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+       globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+       globalStats.buf_written_lru += msg->m_buf_written_lru;
+       globalStats.buf_written_all += msg->m_buf_written_all;
+       globalStats.maxwritten_lru += msg->m_maxwritten_lru;
+       globalStats.maxwritten_all += msg->m_maxwritten_all;
  }