]> granicus.if.org Git - postgresql/blobdiff - src/backend/postmaster/pgstat.c
Fix up pgstats counting of live and dead tuples to recognize that committed
[postgresql] / src / backend / postmaster / pgstat.c
index 50486f8cef2a44e71955c32c00896b54e2c088af..b41a16de44ce86435068597a40e0fa3537ccd08b 100644 (file)
@@ -11,9 +11,9 @@
  *                     - Add a pgstat config column to pg_database, so this
  *                       entire thing can be enabled/disabled on a per db basis.
  *
- *     Copyright (c) 2001-2006, PostgreSQL Global Development Group
+ *     Copyright (c) 2001-2007, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.128 2006/06/18 15:38:37 petere Exp $
+ *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
  * ----------
  */
 #include "postgres.h"
 #include <arpa/inet.h>
 #include <signal.h>
 #include <time.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#ifdef HAVE_SYS_POLL_H
+#include <sys/poll.h>
+#endif
 
 #include "pgstat.h"
 
 #include "access/heapam.h"
+#include "access/transam.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/pg_database.h"
+#include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
 #include "mb/pg_wchar.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
-#include "storage/procarray.h"
-#include "tcop/tcopprot.h"
-#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
-#include "utils/rel.h"
-#include "utils/syscache.h"
 
 
 /* ----------
                                                                                 * failed statistics collector; in
                                                                                 * seconds. */
 
-/* ----------
- * Amount of space reserved in pgstat_recvbuffer().
- * ----------
- */
-#define PGSTAT_RECVBUFFERSZ            ((int) (1024 * sizeof(PgStat_Msg)))
+#define PGSTAT_SELECT_TIMEOUT  2               /* How often to check for postmaster
+                                                                                * death; in seconds. */
+
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
 #define PGSTAT_DB_HASH_SIZE            16
-#define PGSTAT_BE_HASH_SIZE            512
 #define PGSTAT_TAB_HASH_SIZE   512
 
 
  */
 bool           pgstat_collect_startcollector = true;
 bool           pgstat_collect_resetonpmstart = false;
-bool           pgstat_collect_querystring = false;
 bool           pgstat_collect_tuplelevel = false;
 bool           pgstat_collect_blocklevel = false;
+bool           pgstat_collect_querystring = false;
+
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
 
 /* ----------
  * Local data
  * ----------
  */
 NON_EXEC_STATIC int pgStatSock = -1;
-NON_EXEC_STATIC int pgStatPipe[2] = {-1, -1};
+
 static struct sockaddr_storage pgStatAddr;
-static pid_t pgStatCollectorPid = 0;
 
 static time_t last_pgstat_start_time;
 
-static long pgStatNumMessages = 0;
-
 static bool pgStatRunningInCollector = false;
 
 /*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
  */
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM                100                     /* we alloc this many at a time */
+
+typedef struct TabStatusArray
 {
-       int                     tsa_alloc;              /* num allocated */
-       int                     tsa_used;               /* num actually used */
-       PgStat_MsgTabstat **tsa_messages;       /* the array itself */
-} TabStatArray;
+       struct TabStatusArray *tsa_next;        /* link to next array, if any */
+       int                     tsa_used;                               /* # entries currently used */
+       PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
+} TabStatusArray;
+
+static TabStatusArray *pgStatTabList = NULL;
 
-#define TABSTAT_QUANTUM                4       /* we alloc this many at a time */
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+       int                     nest_level;                             /* subtransaction nest level */
+       struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
+       PgStat_TableXactStatus *first;          /* head of list for this subxact */
+} PgStat_SubXactStatus;
 
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
 
 static int     pgStatXactCommit = 0;
 static int     pgStatXactRollback = 0;
 
-static TransactionId pgStatDBHashXact = InvalidTransactionId;
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+       PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+       PgStat_Counter tuples_deleted;  /* tuples deleted in xact */
+       Oid                     t_id;                           /* table's OID */
+       bool            t_shared;                       /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
+static MemoryContext pgStatLocalContext = NULL;
 static HTAB *pgStatDBHash = NULL;
-static PgStat_StatBeEntry *pgStatBeTable = NULL;
-static int     pgStatNumBackends = 0;
+static PgBackendStatus *localBackendStatusTable = NULL;
+static int     localNumBackends = 0;
+
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
 
-static volatile bool   need_statwrite;
+static volatile bool need_exit = false;
+static volatile bool need_statwrite = false;
 
 
 /* ----------
@@ -146,41 +192,30 @@ static volatile bool      need_statwrite;
  * ----------
  */
 #ifdef EXEC_BACKEND
-
-typedef enum STATS_PROCESS_TYPE
-{
-       STAT_PROC_BUFFER,
-       STAT_PROC_COLLECTOR
-}      STATS_PROCESS_TYPE;
-
-static pid_t pgstat_forkexec(STATS_PROCESS_TYPE procType);
-static void pgstat_parseArgs(int argc, char *argv[]);
+static pid_t pgstat_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgstatBufferMain(int argc, char *argv[]);
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
-static void force_statwrite(SIGNAL_ARGS);
-static void pgstat_recvbuffer(void);
 static void pgstat_exit(SIGNAL_ARGS);
-static void pgstat_die(SIGNAL_ARGS);
+static void force_statwrite(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static int     pgstat_add_backend(PgStat_MsgHdr *msg);
-static void pgstat_sub_backend(int procpid);
-static void pgstat_drop_database(Oid databaseid);
 static void pgstat_write_statsfile(void);
-static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-                                         PgStat_StatBeEntry **betab,
-                                         int *numbackends);
+static HTAB *pgstat_read_statsfile(Oid onlydb);
 static void backend_read_statsfile(void);
+static void pgstat_read_current_status(void);
+
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
+static HTAB *pgstat_collect_oids(Oid catalogid);
+
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
+static void pgstat_setup_memcxt(void);
 
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
 static void pgstat_send(void *msg, int len);
 
-static void pgstat_recv_bestart(PgStat_MsgBestart *msg, int len);
-static void pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len);
-static void pgstat_recv_activity(PgStat_MsgActivity *msg, int len);
 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
@@ -188,6 +223,7 @@ static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
 
 
 /* ------------------------------------------------------------
@@ -217,14 +253,15 @@ pgstat_init(void)
        char            test_byte;
        int                     sel_res;
        int                     tries = 0;
-       
+
 #define TESTBYTEVAL ((char) 199)
 
        /*
-        * Force start of collector daemon if something to collect
+        * Force start of collector daemon if something to collect.  Note that
+        * pgstat_collect_querystring is now an independent facility that does not
+        * require the collector daemon.
         */
-       if (pgstat_collect_querystring ||
-               pgstat_collect_tuplelevel ||
+       if (pgstat_collect_tuplelevel ||
                pgstat_collect_blocklevel)
                pgstat_collect_startcollector = true;
 
@@ -279,8 +316,8 @@ pgstat_init(void)
 
                if (++tries > 1)
                        ereport(LOG,
-                               (errmsg("trying another address for the statistics collector")));
-               
+                       (errmsg("trying another address for the statistics collector")));
+
                /*
                 * Create the socket.
                 */
@@ -340,8 +377,12 @@ pgstat_init(void)
                 * rules prevent it).
                 */
                test_byte = TESTBYTEVAL;
+
+retry1:
                if (send(pgStatSock, &test_byte, 1, 0) != 1)
                {
+                       if (errno == EINTR)
+                               goto retry1;    /* if interrupted, just retry */
                        ereport(LOG,
                                        (errcode_for_socket_access(),
                                         errmsg("could not send test message on socket for statistics collector: %m")));
@@ -392,8 +433,11 @@ pgstat_init(void)
 
                test_byte++;                    /* just make sure variable is changed */
 
+retry2:
                if (recv(pgStatSock, &test_byte, 1, 0) != 1)
                {
+                       if (errno == EINTR)
+                               goto retry2;    /* if interrupted, just retry */
                        ereport(LOG,
                                        (errcode_for_socket_access(),
                                         errmsg("could not receive test message on socket for statistics collector: %m")));
@@ -422,9 +466,8 @@ pgstat_init(void)
 
        /*
         * Set the socket to non-blocking IO.  This ensures that if the collector
-        * falls behind (despite the buffering process), statistics messages will
-        * be discarded; backends won't block waiting to send messages to the
-        * collector.
+        * falls behind, statistics messages will be discarded; backends won't
+        * block waiting to send messages to the collector.
         */
        if (!pg_set_noblock(pgStatSock))
        {
@@ -451,7 +494,6 @@ startup_failed:
 
        /* Adjust GUC variables to suppress useless activity */
        pgstat_collect_startcollector = false;
-       pgstat_collect_querystring = false;
        pgstat_collect_tuplelevel = false;
        pgstat_collect_blocklevel = false;
 }
@@ -474,65 +516,23 @@ pgstat_reset_all(void)
 /*
  * pgstat_forkexec() -
  *
- * Format up the arglist for, then fork and exec, statistics
- * (buffer and collector) processes
+ * Format up the arglist for, then fork and exec, statistics collector process
  */
 static pid_t
-pgstat_forkexec(STATS_PROCESS_TYPE procType)
+pgstat_forkexec(void)
 {
        char       *av[10];
-       int                     ac = 0,
-                               bufc = 0,
-                               i;
-       char            pgstatBuf[2][32];
+       int                     ac = 0;
 
        av[ac++] = "postgres";
-
-       switch (procType)
-       {
-               case STAT_PROC_BUFFER:
-                       av[ac++] = "--forkbuf";
-                       break;
-
-               case STAT_PROC_COLLECTOR:
-                       av[ac++] = "--forkcol";
-                       break;
-
-               default:
-                       Assert(false);
-       }
-
+       av[ac++] = "--forkcol";
        av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
 
-       /* postgres_exec_path is not passed by write_backend_variables */
-       av[ac++] = postgres_exec_path;
-
-       /* Add to the arg list */
-       Assert(bufc <= lengthof(pgstatBuf));
-       for (i = 0; i < bufc; i++)
-               av[ac++] = pgstatBuf[i];
-
        av[ac] = NULL;
        Assert(ac < lengthof(av));
 
        return postmaster_forkexec(ac, av);
 }
-
-
-/*
- * pgstat_parseArgs() -
- *
- * Extract data from the arglist for exec'ed statistics
- * (buffer and collector) processes
- */
-static void
-pgstat_parseArgs(int argc, char *argv[])
-{
-       Assert(argc == 4);
-
-       argc = 3;
-       StrNCpy(postgres_exec_path, argv[argc++], MAXPGPATH);
-}
 #endif   /* EXEC_BACKEND */
 
 
@@ -591,14 +591,14 @@ pgstat_start(void)
         * Okay, fork off the collector.
         */
 #ifdef EXEC_BACKEND
-       switch ((pgStatPid = pgstat_forkexec(STAT_PROC_BUFFER)))
+       switch ((pgStatPid = pgstat_forkexec()))
 #else
        switch ((pgStatPid = fork_process()))
 #endif
        {
                case -1:
                        ereport(LOG,
-                                       (errmsg("could not fork statistics buffer: %m")));
+                                       (errmsg("could not fork statistics collector: %m")));
                        return 0;
 
 #ifndef EXEC_BACKEND
@@ -613,7 +613,7 @@ pgstat_start(void)
                        /* Drop our connection to postmaster's shared memory, as well */
                        PGSharedMemoryDetach();
 
-                       PgstatBufferMain(0, NULL);
+                       PgstatCollectorMain(0, NULL);
                        break;
 #endif
 
@@ -625,51 +625,9 @@ pgstat_start(void)
        return 0;
 }
 
-
-/* ----------
- * pgstat_beterm() -
- *
- *     Called from postmaster to tell collector a backend terminated.
- * ----------
- */
-void
-pgstat_beterm(int pid)
-{
-       PgStat_MsgBeterm msg;
-
-       if (pgStatSock < 0)
-               return;
-
-       /* can't use pgstat_setheader() because it's not called in a backend */
-       MemSet(&(msg.m_hdr), 0, sizeof(msg.m_hdr));
-       msg.m_hdr.m_type = PGSTAT_MTYPE_BETERM;
-       msg.m_hdr.m_procpid = pid;
-
-       pgstat_send(&msg, sizeof(msg));
-}
-
-
-/* ----------
- * pgstat_report_autovac() -
- *
- *     Called from autovacuum.c to report startup of an autovacuum process.
- *     We are called before InitPostgres is done, so can't rely on MyDatabaseId;
- *     the db OID must be passed in, instead.
- * ----------
- */
-void
-pgstat_report_autovac(Oid dboid)
+void allow_immediate_pgstat_restart(void)
 {
-       PgStat_MsgAutovacStart msg;
-
-       if (pgStatSock < 0)
-               return;
-
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
-       msg.m_databaseid = dboid;
-       msg.m_start_time = GetCurrentTimestamp();
-
-       pgstat_send(&msg, sizeof(msg));
+               last_pgstat_start_time = 0;
 }
 
 /* ------------------------------------------------------------
@@ -679,202 +637,138 @@ pgstat_report_autovac(Oid dboid)
 
 
 /* ----------
- * pgstat_bestart() -
+ * pgstat_report_tabstat() -
  *
- *     Tell the collector that this new backend is soon ready to process
- *     queries. Called from InitPostgres.
+ *     Called from tcop/postgres.c to send the per-table access statistics
+ *     collected so far to the collector.  Unless "force" is true, nothing
+ *     is sent if less than PGSTAT_STAT_INTERVAL msec have passed since the
+ *     last report.  Note that this is called only when not within a
+ *     transaction, so it is fair to use transaction stop time as an
+ *     approximation of current time.
  * ----------
  */
 void
-pgstat_bestart(void)
+pgstat_report_tabstat(bool force)
 {
-       PgStat_MsgBestart msg;
+       /* we assume this inits to all zeroes: */
+       static const PgStat_TableCounts all_zeroes;
+       static TimestampTz last_report = 0;     
+
+       TimestampTz now;
+       PgStat_MsgTabstat regular_msg;
+       PgStat_MsgTabstat shared_msg;
+       TabStatusArray *tsa;
+       int                     i;
 
-       if (pgStatSock < 0)
+       /* Don't expend a clock check if nothing to do */
+       if (pgStatTabList == NULL ||
+               pgStatTabList->tsa_used == 0)
                return;
 
        /*
-        * We may not have a MyProcPort (eg, if this is the autovacuum process).
-        * Send an all-zeroes client address, which is dealt with specially in
-        * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
+        * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+        * msec since we last sent one, or the caller wants to force stats out.
         */
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_BESTART);
-       msg.m_databaseid = MyDatabaseId;
-       msg.m_userid = GetSessionUserId();
-       if (MyProcPort)
-               memcpy(&msg.m_clientaddr, &MyProcPort->raddr, sizeof(msg.m_clientaddr));
-       else
-               MemSet(&msg.m_clientaddr, 0, sizeof(msg.m_clientaddr));
-       pgstat_send(&msg, sizeof(msg));
+       now = GetCurrentTransactionStopTimestamp();
+       if (!force &&
+               !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+               return;
+       last_report = now;
 
        /*
-        * Set up a process-exit hook to ensure we flush the last batch of
-        * statistics to the collector.
+        * Scan through the TabStatusArray struct(s) to find tables that actually
+        * have counts, and build messages to send.  We have to separate shared
+        * relations from regular ones because the databaseid field in the
+        * message header has to depend on that.
         */
-       on_shmem_exit(pgstat_beshutdown_hook, 0);
-}
-
-/* ---------
- * pgstat_report_vacuum() -
- *
- *     Tell the collector about the table we just vacuumed.
- * ---------
- */
-void
-pgstat_report_vacuum(Oid tableoid, bool shared,
-                                        bool analyze, PgStat_Counter tuples)
-{
-       PgStat_MsgVacuum msg;
-
-       if (pgStatSock < 0 ||
-               !pgstat_collect_tuplelevel)
-               return;
+       regular_msg.m_databaseid = MyDatabaseId;
+       shared_msg.m_databaseid = InvalidOid;
+       regular_msg.m_nentries = 0;
+       shared_msg.m_nentries = 0;
 
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
-       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-       msg.m_tableoid = tableoid;
-       msg.m_analyze = analyze;
-       msg.m_autovacuum = IsAutoVacuumProcess(); /* is this autovacuum? */
-       msg.m_vacuumtime = GetCurrentTimestamp();
-       msg.m_tuples = tuples;
-       pgstat_send(&msg, sizeof(msg));
-}
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+       {
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+                       PgStat_MsgTabstat *this_msg;
+                       PgStat_TableEntry *this_ent;
 
-/* --------
- * pgstat_report_analyze() -
- *
- *     Tell the collector about the table we just analyzed.
- * --------
- */
-void
-pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
-                                         PgStat_Counter deadtuples)
-{
-       PgStat_MsgAnalyze msg;
+                       /* Shouldn't have any pending transaction-dependent counts */
+                       Assert(entry->trans == NULL);
 
-       if (pgStatSock < 0 ||
-               !pgstat_collect_tuplelevel)
-               return;
+                       /*
+                        * Ignore entries that didn't accumulate any actual counts,
+                        * such as indexes that were opened by the planner but not used.
+                        */
+                       if (memcmp(&entry->t_counts, &all_zeroes,
+                                          sizeof(PgStat_TableCounts)) == 0)
+                               continue;
+                       /*
+                        * OK, insert data into the appropriate message, and send if full.
+                        */
+                       this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+                       this_ent = &this_msg->m_entry[this_msg->m_nentries];
+                       this_ent->t_id = entry->t_id;
+                       memcpy(&this_ent->t_counts, &entry->t_counts,
+                                  sizeof(PgStat_TableCounts));
+                       if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+                       {
+                               pgstat_send_tabstat(this_msg);
+                               this_msg->m_nentries = 0;
+                       }
+               }
+               /* zero out TableStatus structs after use */
+               MemSet(tsa->tsa_entries, 0,
+                          tsa->tsa_used * sizeof(PgStat_TableStatus));
+               tsa->tsa_used = 0;
+       }
 
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
-       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-       msg.m_tableoid = tableoid;
-       msg.m_autovacuum = IsAutoVacuumProcess(); /* is this autovacuum? */
-       msg.m_analyzetime = GetCurrentTimestamp();
-       msg.m_live_tuples = livetuples;
-       msg.m_dead_tuples = deadtuples;
-       pgstat_send(&msg, sizeof(msg));
+       /*
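+        * Send partial messages.  If force is true, make sure that any pending
+        * xact commit/abort gets counted, even if none of the tables in use had
+        * counts to send.  (The early exit at the top of this function is taken
+        * before we get here when no table entries are in use at all, so in
+        * that case pending xact counts are not sent, even with force.)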
+        */
+       if (regular_msg.m_nentries > 0 ||
+               (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+               pgstat_send_tabstat(&regular_msg);
+       if (shared_msg.m_nentries > 0)
+               pgstat_send_tabstat(&shared_msg);
 }
 
 /*
- * Flush any remaining statistics counts out to the collector at process
- * exit.   Without this, operations triggered during backend exit (such as
- * temp table deletions) won't be counted.
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
  */
 static void
-pgstat_beshutdown_hook(int code, Datum arg)
-{
-       pgstat_report_tabstat();
-}
-
-
-/* ----------
- * pgstat_report_activity() -
- *
- *     Called from tcop/postgres.c to tell the collector what the backend
- *     is actually doing (usually "<IDLE>" or the start of the query to
- *     be executed).
- * ----------
- */
-void
-pgstat_report_activity(const char *cmd_str)
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
 {
-       PgStat_MsgActivity msg;
+       int                     n;
        int                     len;
 
-       if (!pgstat_collect_querystring || pgStatSock < 0)
-               return;
-
-       len = strlen(cmd_str);
-       len = pg_mbcliplen(cmd_str, len, PGSTAT_ACTIVITY_SIZE - 1);
-
-       memcpy(msg.m_cmd_str, cmd_str, len);
-       msg.m_cmd_str[len] = '\0';
-       len += offsetof(PgStat_MsgActivity, m_cmd_str) + 1;
-
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ACTIVITY);
-       pgstat_send(&msg, len);
-}
-
-
-/* ----------
- * pgstat_report_tabstat() -
- *
- *     Called from tcop/postgres.c to send the so far collected
- *     per table access statistics to the collector.
- * ----------
- */
-void
-pgstat_report_tabstat(void)
-{
-       int                     i;
-
-       if (pgStatSock < 0 ||
-               (!pgstat_collect_querystring &&
-                !pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel))
-       {
-               /* Not reporting stats, so just flush whatever we have */
-               RegularTabStat.tsa_used = 0;
-               SharedTabStat.tsa_used = 0;
+       /* It's unlikely we'd get here with no socket, but maybe not impossible */
+       if (pgStatSock < 0)
                return;
-       }
 
        /*
-        * For each message buffer used during the last query set the header
-        * fields and send it out.
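+        * Report (and reset) the accumulated xact commit/rollback counts whenever
+        * we send a regular tabstat message, that is, one with a valid
+        * m_databaseid.  Messages for shared relations always carry zero counts.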
         */
-       for (i = 0; i < RegularTabStat.tsa_used; i++)
+       if (OidIsValid(tsmsg->m_databaseid))
        {
-               PgStat_MsgTabstat *tsmsg = RegularTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
-
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
-
                tsmsg->m_xact_commit = pgStatXactCommit;
                tsmsg->m_xact_rollback = pgStatXactRollback;
                pgStatXactCommit = 0;
                pgStatXactRollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = MyDatabaseId;
-               pgstat_send(tsmsg, len);
        }
-       RegularTabStat.tsa_used = 0;
-
-       /* Ditto, for shared relations */
-       for (i = 0; i < SharedTabStat.tsa_used; i++)
+       else
        {
-               PgStat_MsgTabstat *tsmsg = SharedTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
-
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
-
-               /* We don't report transaction commit/abort here */
                tsmsg->m_xact_commit = 0;
                tsmsg->m_xact_rollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = InvalidOid;
-               pgstat_send(tsmsg, len);
        }
-       SharedTabStat.tsa_used = 0;
+
+       n = tsmsg->m_nentries;
+       len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+               n * sizeof(PgStat_TableEntry);
+
+       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+       pgstat_send(tsmsg, len);
 }
 
 
@@ -887,10 +781,7 @@ pgstat_report_tabstat(void)
 void
 pgstat_vacuum_tabstat(void)
 {
-       List       *oidlist;
-       Relation        rel;
-       HeapScanDesc scan;
-       HeapTuple       tup;
+       HTAB       *htab;
        PgStat_MsgTabpurge msg;
        HASH_SEQ_STATUS hstat;
        PgStat_StatDBEntry *dbentry;
@@ -909,15 +800,7 @@ pgstat_vacuum_tabstat(void)
        /*
         * Read pg_database and make a list of OIDs of all existing databases
         */
-       oidlist = NIL;
-       rel = heap_open(DatabaseRelationId, AccessShareLock);
-       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
-       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
-       {
-               oidlist = lappend_oid(oidlist, HeapTupleGetOid(tup));
-       }
-       heap_endscan(scan);
-       heap_close(rel, AccessShareLock);
+       htab = pgstat_collect_oids(DatabaseRelationId);
 
        /*
         * Search the database hash table for dead databases and tell the
@@ -928,12 +811,14 @@ pgstat_vacuum_tabstat(void)
        {
                Oid                     dbid = dbentry->databaseid;
 
-               if (!list_member_oid(oidlist, dbid))
+               CHECK_FOR_INTERRUPTS();
+
+               if (hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
                        pgstat_drop_database(dbid);
        }
 
        /* Clean up */
-       list_free(oidlist);
+       hash_destroy(htab);
 
        /*
         * Lookup our own database entry; if not found, nothing more to do.
@@ -947,15 +832,7 @@ pgstat_vacuum_tabstat(void)
        /*
         * Similarly to above, make a list of all known relations in this DB.
         */
-       oidlist = NIL;
-       rel = heap_open(RelationRelationId, AccessShareLock);
-       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
-       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
-       {
-               oidlist = lappend_oid(oidlist, HeapTupleGetOid(tup));
-       }
-       heap_endscan(scan);
-       heap_close(rel, AccessShareLock);
+       htab = pgstat_collect_oids(RelationRelationId);
 
        /*
         * Initialize our messages table counter to zero
@@ -968,13 +845,17 @@ pgstat_vacuum_tabstat(void)
        hash_seq_init(&hstat, dbentry->tables);
        while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
        {
-               if (list_member_oid(oidlist, tabentry->tableid))
+               Oid                     tabid = tabentry->tableid;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
                        continue;
 
                /*
                 * Not there, so add this table's Oid to the message
                 */
-               msg.m_tableid[msg.m_nentries++] = tabentry->tableid;
+               msg.m_tableid[msg.m_nentries++] = tabid;
 
                /*
                 * If the message is full, send it out and reinitialize to empty
@@ -1006,33 +887,76 @@ pgstat_vacuum_tabstat(void)
        }
 
        /* Clean up */
-       list_free(oidlist);
+       hash_destroy(htab);
 }
 
 
 /* ----------
- * pgstat_drop_database() -
+ * pgstat_collect_oids() -
  *
- *     Tell the collector that we just dropped a database.
- *     (If the message gets lost, we will still clean the dead DB eventually
- *     via future invocations of pgstat_vacuum_tabstat().)
+ *     Collect the OIDs of either all databases or all tables, according to
+ *     the parameter, into a temporary hash table.  Caller should hash_destroy
+ *     the result when done with it.
  * ----------
  */
-static void
-pgstat_drop_database(Oid databaseid)
+static HTAB *
+pgstat_collect_oids(Oid catalogid)
 {
-       PgStat_MsgDropdb msg;
+       HTAB       *htab;
+       HASHCTL         hash_ctl;
+       Relation        rel;
+       HeapScanDesc scan;
+       HeapTuple       tup;
 
-       if (pgStatSock < 0)
-               return;
+       memset(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(Oid);
+       hash_ctl.entrysize = sizeof(Oid);
+       hash_ctl.hash = oid_hash;
+       htab = hash_create("Temporary table of OIDs",
+                                          PGSTAT_TAB_HASH_SIZE,
+                                          &hash_ctl,
+                                          HASH_ELEM | HASH_FUNCTION);
 
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
-       msg.m_databaseid = databaseid;
-       pgstat_send(&msg, sizeof(msg));
-}
+       rel = heap_open(catalogid, AccessShareLock);
+       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
+       {
+               Oid             thisoid = HeapTupleGetOid(tup);
 
+               CHECK_FOR_INTERRUPTS();
 
-/* ----------
+               (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
+       }
+       heap_endscan(scan);
+       heap_close(rel, AccessShareLock);
+
+       return htab;
+}
+
+
+/* ----------
+ * pgstat_drop_database() -
+ *
+ *     Tell the collector that we just dropped a database.
+ *     (If the message gets lost, we will still clean the dead DB eventually
+ *     via future invocations of pgstat_vacuum_tabstat().)
+ * ----------
+ */
+void
+pgstat_drop_database(Oid databaseid)
+{
+       PgStat_MsgDropdb msg;
+
+       if (pgStatSock < 0)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
+       msg.m_databaseid = databaseid;
+       pgstat_send(&msg, sizeof(msg));
+}
+
+
+/* ----------
  * pgstat_drop_relation() -
  *
  *     Tell the collector that we just dropped a relation.
@@ -1052,7 +976,7 @@ pgstat_drop_relation(Oid relid)
        msg.m_tableid[0] = relid;
        msg.m_nentries = 1;
 
-       len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
+       len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) +sizeof(Oid);
 
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
        msg.m_databaseid = MyDatabaseId;
@@ -1086,203 +1010,581 @@ pgstat_reset_counters(void)
 
 
 /* ----------
- * pgstat_ping() -
+ * pgstat_report_autovac() -
  *
- *     Send some junk data to the collector to increase traffic.
+ *     Called from autovacuum.c to report startup of an autovacuum process.
+ *     We are called before InitPostgres is done, so can't rely on MyDatabaseId;
+ *     the db OID must be passed in, instead.
  * ----------
  */
 void
-pgstat_ping(void)
+pgstat_report_autovac(Oid dboid)
 {
-       PgStat_MsgDummy msg;
+       PgStat_MsgAutovacStart msg;
 
        if (pgStatSock < 0)
                return;
 
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
+       msg.m_databaseid = dboid;
+       msg.m_start_time = GetCurrentTimestamp();
+
        pgstat_send(&msg, sizeof(msg));
 }
 
-/*
- * Enlarge a TabStatArray
+
+/* ---------
+ * pgstat_report_vacuum() -
+ *
+ *     Tell the collector about the table we just vacuumed.
+ * ---------
  */
-static void
-more_tabstat_space(TabStatArray *tsarr)
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+                                        bool analyze, PgStat_Counter tuples)
 {
-       PgStat_MsgTabstat *newMessages;
-       PgStat_MsgTabstat **msgArray;
-       int                     newAlloc;
-       int                     i;
+       PgStat_MsgVacuum msg;
 
-       AssertArg(PointerIsValid(tsarr));
+       if (pgStatSock < 0 ||
+               !pgstat_collect_tuplelevel)
+               return;
 
-       newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
+       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+       msg.m_tableoid = tableoid;
+       msg.m_analyze = analyze;
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
+       msg.m_vacuumtime = GetCurrentTimestamp();
+       msg.m_tuples = tuples;
+       pgstat_send(&msg, sizeof(msg));
+}
 
-       /* Create (another) quantum of message buffers */
-       newMessages = (PgStat_MsgTabstat *)
-               MemoryContextAllocZero(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
+/* --------
+ * pgstat_report_analyze() -
+ *
+ *     Tell the collector about the table we just analyzed.
+ * --------
+ */
+void
+pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
+                                         PgStat_Counter deadtuples)
+{
+       PgStat_MsgAnalyze msg;
 
-       /* Create or enlarge the pointer array */
-       if (tsarr->tsa_messages == NULL)
-               msgArray = (PgStat_MsgTabstat **)
-                       MemoryContextAlloc(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat *) * newAlloc);
-       else
-               msgArray = (PgStat_MsgTabstat **)
-                       repalloc(tsarr->tsa_messages,
-                                        sizeof(PgStat_MsgTabstat *) * newAlloc);
+       if (pgStatSock < 0 ||
+               !pgstat_collect_tuplelevel)
+               return;
 
-       for (i = 0; i < TABSTAT_QUANTUM; i++)
-               msgArray[tsarr->tsa_alloc + i] = newMessages++;
-       tsarr->tsa_messages = msgArray;
-       tsarr->tsa_alloc = newAlloc;
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
+       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+       msg.m_tableoid = tableoid;
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
+       msg.m_analyzetime = GetCurrentTimestamp();
+       msg.m_live_tuples = livetuples;
+       msg.m_dead_tuples = deadtuples;
+       pgstat_send(&msg, sizeof(msg));
+}
+
+
+/* ----------
+ * pgstat_ping() -
+ *
+ *     Send some junk data to the collector to increase traffic.
+ * ----------
+ */
+void
+pgstat_ping(void)
+{
+       PgStat_MsgDummy msg;
 
-       Assert(tsarr->tsa_used < tsarr->tsa_alloc);
+       if (pgStatSock < 0)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+       pgstat_send(&msg, sizeof(msg));
 }
 
+
 /* ----------
  * pgstat_initstats() -
  *
- *     Called from various places usually dealing with initialization
- *     of Relation or Scan structures. The data placed into these
- *     structures from here tell where later to count for buffer reads,
- *     scans and tuples fetched.
+ *     Initialize a relcache entry to count access statistics.
+ *     Called whenever a relation is opened.
+ *
+ *     We assume that a relcache entry's pgstat_info field is zeroed by
+ *     relcache.c when the relcache entry is made; thereafter it is long-lived
+ *     data.  We can avoid repeated searches of the TabStatus arrays when the
+ *     same relation is touched repeatedly within a transaction.
  * ----------
  */
 void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
 {
        Oid                     rel_id = rel->rd_id;
-       PgStat_TableEntry *useent;
-       TabStatArray *tsarr;
-       PgStat_MsgTabstat *tsmsg;
-       int                     mb;
-       int                     i;
+       char            relkind = rel->rd_rel->relkind;
 
-       /*
-        * Initialize data not to count at all.
-        */
-       stats->tabentry = NULL;
+       /* We only count stats for things that have storage */
+       if (!(relkind == RELKIND_RELATION ||
+                 relkind == RELKIND_INDEX ||
+                 relkind == RELKIND_TOASTVALUE))
+       {
+               rel->pgstat_info = NULL;
+               return;
+       }
 
        if (pgStatSock < 0 ||
                !(pgstat_collect_tuplelevel ||
                  pgstat_collect_blocklevel))
+       {
+               /* We're not counting at all */
+               rel->pgstat_info = NULL;
                return;
+       }
 
-       tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
+       /*
+        * If we already set up this relation in the current transaction,
+        * nothing to do.
+        */
+       if (rel->pgstat_info != NULL &&
+               rel->pgstat_info->t_id == rel_id)
+               return;
+
+       /* Else find or make the PgStat_TableStatus entry, and update link */
+       rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       TabStatusArray *prev_tsa;
+       int                     i;
 
        /*
-        * Search the already-used message slots for this relation.
+        * Search the already-used tabstat slots for this relation.
         */
-       for (mb = 0; mb < tsarr->tsa_used; mb++)
+       prev_tsa = NULL;
+       for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
        {
-               tsmsg = tsarr->tsa_messages[mb];
-
-               for (i = tsmsg->m_nentries; --i >= 0;)
+               for (i = 0; i < tsa->tsa_used; i++)
                {
-                       if (tsmsg->m_entry[i].t_id == rel_id)
-                       {
-                               stats->tabentry = (void *) &(tsmsg->m_entry[i]);
-                               return;
-                       }
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
                }
 
-               if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-                       continue;
-
-               /*
-                * Not found, but found a message buffer with an empty slot instead.
-                * Fine, let's use this one.
-                */
-               i = tsmsg->m_nentries++;
-               useent = &tsmsg->m_entry[i];
-               MemSet(useent, 0, sizeof(PgStat_TableEntry));
-               useent->t_id = rel_id;
-               stats->tabentry = (void *) useent;
-               return;
+               if (tsa->tsa_used < TABSTAT_QUANTUM)
+               {
+                       /*
+                        * It must not be present, but we found a free slot instead.
+                        * Fine, let's use this one.  We assume the entry was already
+                        * zeroed, either at creation or after last use.
+                        */
+                       entry = &tsa->tsa_entries[tsa->tsa_used++];
+                       entry->t_id = rel_id;
+                       entry->t_shared = isshared;
+                       return entry;
+               }
        }
 
        /*
-        * If we ran out of message buffers, we just allocate more.
+        * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
+        */
+       tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+                                                                                                       sizeof(TabStatusArray));
+       if (prev_tsa)
+               prev_tsa->tsa_next = tsa;
+       else
+               pgStatTabList = tsa;
+
+       /*
+        * Use the first entry of the new TabStatusArray.
         */
-       if (tsarr->tsa_used >= tsarr->tsa_alloc)
-               more_tabstat_space(tsarr);
+       entry = &tsa->tsa_entries[tsa->tsa_used++];
+       entry->t_id = rel_id;
+       entry->t_shared = isshared;
+       return entry;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state == NULL || xact_state->nest_level != nest_level)
+       {
+               xact_state = (PgStat_SubXactStatus *)
+                       MemoryContextAlloc(TopTransactionContext,
+                                                          sizeof(PgStat_SubXactStatus));
+               xact_state->nest_level = nest_level;
+               xact_state->prev = pgStatXactStack;
+               xact_state->first = NULL;
+               pgStatXactStack = xact_state;
+       }
+       return xact_state;
+}
+
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+       PgStat_TableXactStatus *trans;
 
        /*
-        * Use the first entry of the next message buffer.
+        * If this is the first rel to be modified at the current nest level,
+        * we first have to push a transaction stack entry.
         */
-       mb = tsarr->tsa_used++;
-       tsmsg = tsarr->tsa_messages[mb];
-       tsmsg->m_nentries = 1;
-       useent = &tsmsg->m_entry[0];
-       MemSet(useent, 0, sizeof(PgStat_TableEntry));
-       useent->t_id = rel_id;
-       stats->tabentry = (void *) useent;
+       xact_state = get_tabstat_stack_level(nest_level);
+
+       /* Now make a per-table stack entry */
+       trans = (PgStat_TableXactStatus *)
+               MemoryContextAllocZero(TopTransactionContext,
+                                                          sizeof(PgStat_TableXactStatus));
+       trans->nest_level = nest_level;
+       trans->upper = pgstat_info->trans;
+       trans->parent = pgstat_info;
+       trans->next = xact_state->first;
+       xact_state->first = trans;
+       pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_inserted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_inserted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_inserted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update
+ */
+void
+pgstat_count_heap_update(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_updated is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_updated++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               /* An UPDATE both inserts a new tuple and deletes the old */
+               pgstat_info->trans->tuples_inserted++;
+               pgstat_info->trans->tuples_deleted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_deleted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_deleted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_deleted++;
+       }
 }
 
 
 /* ----------
- * pgstat_count_xact_commit() -
+ * AtEOXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction commits.
+ *     Called from access/transam/xact.c at top-level transaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_commit(void)
+AtEOXact_PgStat(bool isCommit)
 {
-       if      (!pgstat_collect_querystring &&
-                !pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactCommit++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Count transaction commit or abort.  (We use counters, not just bools,
+        * in case the reporting message isn't sent right away.)
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       if (isCommit)
+               pgStatXactCommit++;
+       else
+               pgStatXactRollback++;
 
-       if (RegularTabStat.tsa_used == 0)
+       /*
+        * Transfer transactional insert/delete counts into the base tabstat
+        * entries.  We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+                       }
+                       else
+                       {
+                               /* inserted tuples are dead, deleted tuples are unaffected */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                       }
+                       tabstat->trans = NULL;
+               }
        }
-}
+       pgStatXactStack = NULL;
 
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
 
 /* ----------
- * pgstat_count_xact_rollback() -
+ * AtEOSubXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction rollbacks.
+ *     Called from access/transam/xact.c at subtransaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_rollback(void)
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
 {
-       if      (!pgstat_collect_querystring &&
-                !pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactRollback++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Transfer transactional insert/delete counts into the next higher
+        * subtransaction state.
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL &&
+               xact_state->nest_level >= nestDepth)
+       {
+               PgStat_TableXactStatus *trans;
+               PgStat_TableXactStatus *next_trans;
+
+               /* delink xact_state from stack immediately to simplify reuse case */
+               pgStatXactStack = xact_state->prev;
+
+               for (trans = xact_state->first; trans != NULL; trans = next_trans)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       next_trans = trans->next;
+                       Assert(trans->nest_level == nestDepth);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+                               {
+                                       trans->upper->tuples_inserted += trans->tuples_inserted;
+                                       trans->upper->tuples_deleted += trans->tuples_deleted;
+                                       tabstat->trans = trans->upper;
+                                       pfree(trans);
+                               }
+                               else
+                               {
+                                       /*
+                                        * When there isn't an immediate parent state, we can
+                                        * just reuse the record instead of going through a
+                                        * palloc/pfree pushup (this works since it's all in
+                                        * TopTransactionContext anyway).  We have to re-link
+                                        * it into the parent level, though, and that might mean
+                                        * pushing a new entry into the pgStatXactStack.
+                                        */
+                                       PgStat_SubXactStatus *upper_xact_state;
 
-       if (RegularTabStat.tsa_used == 0)
+                                       upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+                                       trans->next = upper_xact_state->first;
+                                       upper_xact_state->first = trans;
+                                       trans->nest_level = nestDepth - 1;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * On abort, inserted tuples are dead (and can be bounced out
+                                * to the top-level tabstat), deleted tuples are unaffected
+                                */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                               tabstat->trans = trans->upper;
+                               pfree(trans);
+                       }
+               }
+               pfree(xact_state);
+       }
+}
+
+
+/*
+ * AtPrepare_PgStat
+ *             Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+                       TwoPhasePgStatRecord record;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+
+                       record.tuples_inserted = trans->tuples_inserted;
+                       record.tuples_deleted = trans->tuples_deleted;
+                       record.t_id = tabstat->t_id;
+                       record.t_shared = tabstat->t_shared;
+
+                       RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+                                                                  &record, sizeof(TwoPhasePgStatRecord));
+               }
+       }
+}
+
+/*
+ * PostPrepare_PgStat
+ *             Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       tabstat = trans->parent;
+                       tabstat->trans = NULL;
+               }
        }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                  void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                 void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* inserted tuples are dead, deleted tuples are no-ops */
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
 }
 
 
@@ -1375,40 +1677,365 @@ pgstat_fetch_stat_tabentry(Oid relid)
  * pgstat_fetch_stat_beentry() -
  *
  *     Support function for the SQL-callable pgstat* functions. Returns
- *     the actual activity slot of one active backend. The caller is
- *     responsible for a check if the actual user is permitted to see
- *     that info (especially the querystring).
+ *     our local copy of the current-activity entry for one backend.
+ *
+ *     NB: caller is responsible for a check if the user is permitted to see
+ *     this info (especially the querystring).
  * ----------
  */
-PgStat_StatBeEntry *
+PgBackendStatus *
 pgstat_fetch_stat_beentry(int beid)
 {
-       backend_read_statsfile();
+       pgstat_read_current_status();
 
-       if (beid < 1 || beid > pgStatNumBackends)
+       if (beid < 1 || beid > localNumBackends)
                return NULL;
 
-       return &pgStatBeTable[beid - 1];
+       return &localBackendStatusTable[beid - 1];
 }
 
 
-/* ----------
- * pgstat_fetch_stat_numbackends() -
- *
- *     Support function for the SQL-callable pgstat* functions. Returns
- *     the maximum current backend id.
- * ----------
- */
-int
-pgstat_fetch_stat_numbackends(void)
-{
-       backend_read_statsfile();
+/* ----------
+ * pgstat_fetch_stat_numbackends() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Returns
+ *     the maximum current backend id.
+ *
+ *     The value returned is localNumBackends, read after
+ *     pgstat_read_current_status() has been called.
+ * ----------
+ */
+int
+pgstat_fetch_stat_numbackends(void)
+{
+       pgstat_read_current_status();
+
+       return localNumBackends;
+}
+
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ *  Support function for the SQL-callable pgstat* functions. Returns
+ *  a pointer to the global statistics struct.
+ *
+ *  backend_read_statsfile() is called first.  The pointer returned is the
+ *  address of the globalStats variable itself, not of a copy.
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
+{
+       backend_read_statsfile();
+
+       return &globalStats;
+}
+
+
+/* ------------------------------------------------------------
+ * Functions for management of the shared-memory PgBackendStatus array
+ * ------------------------------------------------------------
+ */
+
+static PgBackendStatus *BackendStatusArray = NULL; /* set by CreateSharedBackendStatus() */
+static PgBackendStatus *MyBEEntry = NULL; /* set by pgstat_bestart() */
+
+
+/*
+ * Report shared-memory space needed by CreateSharedBackendStatus.
+ *
+ * The result is sizeof(PgBackendStatus) times MaxBackends, computed with
+ * mul_size().
+ */
+Size
+BackendStatusShmemSize(void)
+{
+       Size            size;
+
+       size = mul_size(sizeof(PgBackendStatus), MaxBackends);
+       return size;
+}
+
+/*
+ * Initialize the shared status array during postmaster startup.
+ *
+ * BackendStatusArray is set to the area returned by ShmemInitStruct() for
+ * the name "Backend Status Array", sized by BackendStatusShmemSize().  The
+ * area is zeroed only when ShmemInitStruct() reports it was not found.
+ */
+void
+CreateSharedBackendStatus(void)
+{
+       Size            size = BackendStatusShmemSize();
+       bool            found;
+
+       /* Create or attach to the shared array */
+       BackendStatusArray = (PgBackendStatus *)
+               ShmemInitStruct("Backend Status Array", size, &found);
+
+       if (!found)
+       {
+               /*
+                * We're the first - initialize.
+                */
+               MemSet(BackendStatusArray, 0, size);
+       }
+}
+
+
+/* ----------
+ * pgstat_bestart() -
+ *
+ *     Initialize this backend's entry in the PgBackendStatus array,
+ *     and set up an on-proc-exit hook that will clear it again.
+ *     Called from InitPostgres.  MyBackendId and MyDatabaseId must be set.
+ *
+ *     The entry used is slot MyBackendId - 1 of BackendStatusArray; its
+ *     address is remembered in MyBEEntry.  The hook registered is
+ *     pgstat_beshutdown_hook, via on_shmem_exit().
+ * ----------
+ */
+void
+pgstat_bestart(void)
+{
+       volatile PgBackendStatus *beentry;
+       TimestampTz proc_start_timestamp;
+       Oid                     userid;
+       SockAddr        clientaddr;
+
+       Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+       MyBEEntry = &BackendStatusArray[MyBackendId - 1];
+
+       /*
+        * To minimize the time spent modifying the entry, fetch all the needed
+        * data first.
+        *
+        * If we have a MyProcPort, use its session start time (for consistency,
+        * and to save a kernel call).
+        */
+       if (MyProcPort)
+               proc_start_timestamp = MyProcPort->SessionStartTime;
+       else
+               proc_start_timestamp = GetCurrentTimestamp();
+       userid = GetSessionUserId();
+
+       /*
+        * We may not have a MyProcPort (eg, if this is the autovacuum process).
+        * If so, use all-zeroes client address, which is dealt with specially in
+        * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
+        */
+       if (MyProcPort)
+               memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr));
+       else
+               MemSet(&clientaddr, 0, sizeof(clientaddr));
+
+       /*
+        * Initialize my status entry, following the protocol of bumping
+        * st_changecount before and after; and make sure it's even afterwards. We
+        * use a volatile pointer here to ensure the compiler doesn't try to get
+        * cute.
+        *
+        * The loop below stops as soon as st_changecount is odd, so the count
+        * stays odd while the fields are being written; the final increment
+        * after the assignments makes it even again.
+        */
+       beentry = MyBEEntry;
+       do
+       {
+               beentry->st_changecount++;
+       } while ((beentry->st_changecount & 1) == 0);
+
+       beentry->st_procpid = MyProcPid;
+       beentry->st_proc_start_timestamp = proc_start_timestamp;
+       beentry->st_activity_start_timestamp = 0;
+       beentry->st_txn_start_timestamp = 0;
+       beentry->st_databaseid = MyDatabaseId;
+       beentry->st_userid = userid;
+       beentry->st_clientaddr = clientaddr;
+       beentry->st_waiting = false;
+       beentry->st_activity[0] = '\0';
+       /* Also make sure the last byte in the string area is always 0 */
+       beentry->st_activity[PGBE_ACTIVITY_SIZE - 1] = '\0';
+
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+
+       /*
+        * Set up a process-exit hook to clean up.
+        */
+       on_shmem_exit(pgstat_beshutdown_hook, 0);
+}
+
+/*
+ * Shut down a single backend's statistics reporting at process exit.
+ *
+ * Flush any remaining statistics counts out to the collector.
+ * Without this, operations triggered during backend exit (such as
+ * temp table deletions) won't be counted.
+ *
+ * Lastly, clear out our entry in the PgBackendStatus array.
+ *
+ * The code and arg parameters are not used.  The entry is marked invalid by
+ * setting st_procpid to 0; no other field is reset here.
+ */
+static void
+pgstat_beshutdown_hook(int code, Datum arg)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+
+       pgstat_report_tabstat(true);
+
+       /*
+        * Clear my status entry, following the protocol of bumping st_changecount
+        * before and after.  We use a volatile pointer here to ensure the
+        * compiler doesn't try to get cute.
+        */
+       beentry->st_changecount++;
+
+       beentry->st_procpid = 0;        /* mark invalid */
+
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
+
+
+/* ----------
+ * pgstat_report_activity() -
+ *
+ *     Called from tcop/postgres.c to report what the backend is actually doing
+ *     (usually "<IDLE>" or the start of the query to be executed).
+ *
+ *     Does nothing when pgstat_collect_querystring is off or MyBEEntry has
+ *     not been set.  Otherwise stores the current statement start timestamp
+ *     and a copy of cmd_str.  The copy length is first passed through
+ *     pg_mbcliplen() with a limit of PGBE_ACTIVITY_SIZE - 1, and a
+ *     terminating NUL is written after the copied bytes.
+ * ----------
+ */
+void
+pgstat_report_activity(const char *cmd_str)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+       TimestampTz start_timestamp;
+       int                     len;
+
+       if (!pgstat_collect_querystring || !beentry)
+               return;
+
+       /*
+        * To minimize the time spent modifying the entry, fetch all the needed
+        * data first.
+        */
+       start_timestamp = GetCurrentStatementStartTimestamp();
+
+       len = strlen(cmd_str);
+       len = pg_mbcliplen(cmd_str, len, PGBE_ACTIVITY_SIZE - 1);
+
+       /*
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer here to
+        * ensure the compiler doesn't try to get cute.
+        */
+       beentry->st_changecount++;
+
+       beentry->st_activity_start_timestamp = start_timestamp;
+       memcpy((char *) beentry->st_activity, cmd_str, len);
+       beentry->st_activity[len] = '\0';
+
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
+
+/*
+ * Set the current transaction start timestamp to the specified
+ * value. If there is no current active transaction, this is signified
+ * by 0.
+ *
+ * Does nothing when pgstat_collect_querystring is off or MyBEEntry has
+ * not been set.
+ */
+void
+pgstat_report_txn_timestamp(TimestampTz tstamp)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+
+       if (!pgstat_collect_querystring || !beentry)
+               return;
+
+       /*
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer
+        * here to ensure the compiler doesn't try to get cute.
+        */
+       beentry->st_changecount++;
+       beentry->st_txn_start_timestamp = tstamp;
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
+}
+
+/* ----------
+ * pgstat_report_waiting() -
+ *
+ *     Called from lock manager to report beginning or end of a lock wait.
+ *
+ * NB: this *must* be able to survive being called before MyBEEntry has been
+ * initialized.
+ *
+ * Besides the unset-MyBEEntry case, the call is also a no-op when
+ * pgstat_collect_querystring is off.
+ * ----------
+ */
+void
+pgstat_report_waiting(bool waiting)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
+
+       if (!pgstat_collect_querystring || !beentry)
+               return;
+
+       /*
+        * Since this is a single-byte field in a struct that only this process
+        * may modify, there seems no need to bother with the st_changecount
+        * protocol.  The update must appear atomic in any case.
+        */
+       beentry->st_waiting = waiting;
+}
+
+
+/* ----------
+ * pgstat_read_current_status() -
+ *
+ *     Copy the current contents of the PgBackendStatus array to local memory,
+ *     if not already done in this transaction.
+ *
+ *     "Already done" is detected by localBackendStatusTable being non-NULL.
+ *     The copy is allocated in pgStatLocalContext with room for MaxBackends
+ *     entries.  Only entries whose st_procpid is greater than zero are kept;
+ *     they are packed at the front of the local array and counted in
+ *     localNumBackends.
+ * ----------
+ */
+static void
+pgstat_read_current_status(void)
+{
+       volatile PgBackendStatus *beentry;
+       PgBackendStatus *localtable;
+       PgBackendStatus *localentry;
+       int                     i;
+
+       Assert(!pgStatRunningInCollector);
+       if (localBackendStatusTable)
+               return;                                 /* already done */
+
+       pgstat_setup_memcxt();
+
+       localtable = (PgBackendStatus *)
+               MemoryContextAlloc(pgStatLocalContext,
+                                                  sizeof(PgBackendStatus) * MaxBackends);
+       localNumBackends = 0;
+
+       beentry = BackendStatusArray;
+       localentry = localtable;
+       for (i = 1; i <= MaxBackends; i++)
+       {
+               /*
+                * Follow the protocol of retrying if st_changecount changes while we
+                * copy the entry, or if it's odd.  (The check for odd is needed to
+                * cover the case where we are able to completely copy the entry while
+                * the source backend is between increment steps.)      We use a volatile
+                * pointer here to ensure the compiler doesn't try to get cute.
+                */
+               for (;;)
+               {
+                       int                     save_changecount = beentry->st_changecount;
+
+                       /*
+                        * XXX if PGBE_ACTIVITY_SIZE is really large, it might be best to
+                        * use strcpy not memcpy for copying the activity string?
+                        */
+                       memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
+
+                       if (save_changecount == beentry->st_changecount &&
+                               (save_changecount & 1) == 0)
+                               break;
+
+                       /* Make sure we can break out of loop if stuck... */
+                       CHECK_FOR_INTERRUPTS();
+               }
+
+               beentry++;
+               /* Only valid entries get included into the local array */
+               if (localentry->st_procpid > 0)
+               {
+                       localentry++;
+                       localNumBackends++;
+               }
+       }
 
-       return pgStatNumBackends;
+       /* Set the pointer only after completion of a valid table */
+       localBackendStatusTable = localtable;
 }
 
 
-
 /* ------------------------------------------------------------
  * Local support functions follow
  * ------------------------------------------------------------
@@ -1425,8 +2052,6 @@ static void
 pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
 {
        hdr->m_type = mtype;
-       hdr->m_backendid = MyBackendId;
-       hdr->m_procpid = MyProcPid;
 }
 
 
@@ -1439,109 +2064,64 @@ pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
 static void
 pgstat_send(void *msg, int len)
 {
+       int                     rc;
+
        if (pgStatSock < 0)
                return;
 
        ((PgStat_MsgHdr *) msg)->m_size = len;
 
+       /* We'll retry after EINTR, but ignore all other failures */
+       do
+       {
+               rc = send(pgStatSock, msg, len, 0);
+       } while (rc < 0 && errno == EINTR);
+
 #ifdef USE_ASSERT_CHECKING
-       if (send(pgStatSock, msg, len, 0) < 0)
+       /* In debug builds, log send failures ... */
+       if (rc < 0)
                elog(LOG, "could not send to statistics collector: %m");
-#else
-       send(pgStatSock, msg, len, 0);
-       /* We deliberately ignore any error from send() */
 #endif
 }
 
-
 /* ----------
- * PgstatBufferMain() -
- *
- *     Start up the statistics buffer process.  This is the body of the
- *     postmaster child process.
+ * pgstat_send_bgwriter() -
  *
- *     The argc/argv parameters are valid only in EXEC_BACKEND case.
+ *      Send bgwriter statistics to the collector
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatBufferMain(int argc, char *argv[])
+void
+pgstat_send_bgwriter(void)
 {
-       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
-
-       MyProcPid = getpid();           /* reset MyProcPid */
+       /* We assume this initializes to zeroes */
+       static const PgStat_MsgBgWriter all_zeroes;
 
        /*
-        * Ignore all signals usually bound to some action in the postmaster,
-        * except for SIGCHLD and SIGQUIT --- see pgstat_recvbuffer.
+        * This function can be called even if nothing at all has happened.
+        * In this case, avoid sending a completely empty message to
+        * the stats collector.
         */
-       pqsignal(SIGHUP, SIG_IGN);
-       pqsignal(SIGINT, SIG_IGN);
-       pqsignal(SIGTERM, SIG_IGN);
-       pqsignal(SIGQUIT, pgstat_exit);
-       pqsignal(SIGALRM, SIG_IGN);
-       pqsignal(SIGPIPE, SIG_IGN);
-       pqsignal(SIGUSR1, SIG_IGN);
-       pqsignal(SIGUSR2, SIG_IGN);
-       pqsignal(SIGCHLD, pgstat_die);
-       pqsignal(SIGTTIN, SIG_DFL);
-       pqsignal(SIGTTOU, SIG_DFL);
-       pqsignal(SIGCONT, SIG_DFL);
-       pqsignal(SIGWINCH, SIG_DFL);
-       /* unblock will happen in pgstat_recvbuffer */
-
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc, argv);
-#endif
+       if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+               return;
 
        /*
-        * Start a buffering process to read from the socket, so we have a little
-        * more time to process incoming messages.
-        *
-        * NOTE: the process structure is: postmaster is parent of buffer process
-        * is parent of collector process.      This way, the buffer can detect
-        * collector failure via SIGCHLD, whereas otherwise it wouldn't notice
-        * collector failure until it tried to write on the pipe.  That would mean
-        * that after the postmaster started a new collector, we'd have two buffer
-        * processes competing to read from the UDP socket --- not good.
-        */
-       if (pgpipe(pgStatPipe) < 0)
-               ereport(ERROR,
-                               (errcode_for_socket_access(),
-                                errmsg("could not create pipe for statistics buffer: %m")));
-
-       /* child becomes collector process */
-#ifdef EXEC_BACKEND
-       pgStatCollectorPid = pgstat_forkexec(STAT_PROC_COLLECTOR);
-#else
-       pgStatCollectorPid = fork();
-#endif
-       switch (pgStatCollectorPid)
-       {
-               case -1:
-                       ereport(ERROR,
-                                       (errmsg("could not fork statistics collector: %m")));
-
-#ifndef EXEC_BACKEND
-               case 0:
-                       /* child becomes collector process */
-                       PgstatCollectorMain(0, NULL);
-                       break;
-#endif
+        * Prepare and send the message
+        */
+       pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+       pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
 
-               default:
-                       /* parent becomes buffer process */
-                       closesocket(pgStatPipe[0]);
-                       pgstat_recvbuffer();
-       }
-       exit(0);
+       /*
+        * Clear out the statistics buffer, so it can be re-used.
+        */
+       MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
 
 /* ----------
  * PgstatCollectorMain() -
  *
- *     Start up the statistics collector itself.  This is the body of the
- *     postmaster grandchild process.
+ *     Start up the statistics collector process.      This is the body of the
+ *     postmaster child process.
  *
  *     The argc/argv parameters are valid only in EXEC_BACKEND case.
  * ----------
@@ -1549,30 +2129,43 @@ PgstatBufferMain(int argc, char *argv[])
 NON_EXEC_STATIC void
 PgstatCollectorMain(int argc, char *argv[])
 {
+       struct itimerval write_timeout;
+       bool            need_timer = false;
+       int                     len;
        PgStat_Msg      msg;
+
+#ifndef WIN32
+#ifdef HAVE_POLL
+       struct pollfd input_fd;
+#else
+       struct timeval sel_timeout;
        fd_set          rfds;
-       int                     readPipe;
-       int                     len = 0;
-       struct itimerval timeout;
-       bool            need_timer = false;
+#endif
+#endif
+
+       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
 
        MyProcPid = getpid();           /* reset MyProcPid */
 
        /*
-        * Reset signal handling.  With the exception of restoring default SIGCHLD
-        * and SIGQUIT handling, this is a no-op in the non-EXEC_BACKEND case
-        * because we'll have inherited these settings from the buffer process;
-        * but it's not a no-op for EXEC_BACKEND.
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.  (pgstat probably never has
+        * any child processes, but for consistency we make all postmaster
+        * child processes do this.)
+        */
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
+
+       /*
+        * Ignore all signals usually bound to some action in the postmaster,
+        * except SIGQUIT and SIGALRM.
         */
        pqsignal(SIGHUP, SIG_IGN);
        pqsignal(SIGINT, SIG_IGN);
        pqsignal(SIGTERM, SIG_IGN);
-#ifndef WIN32
-       pqsignal(SIGQUIT, SIG_IGN);
-#else
-       /* kluge to allow buffer process to kill collector; FIXME */
        pqsignal(SIGQUIT, pgstat_exit);
-#endif
        pqsignal(SIGALRM, force_statwrite);
        pqsignal(SIGPIPE, SIG_IGN);
        pqsignal(SIGUSR1, SIG_IGN);
@@ -1584,19 +2177,10 @@ PgstatCollectorMain(int argc, char *argv[])
        pqsignal(SIGWINCH, SIG_DFL);
        PG_SETMASK(&UnBlockSig);
 
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc, argv);
-#endif
-
-       /* Close unwanted files */
-       closesocket(pgStatPipe[1]);
-       closesocket(pgStatSock);
-
        /*
         * Identify myself via ps
         */
-       init_ps_display("stats collector process", "", "");
-       set_ps_display("");
+       init_ps_display("stats collector process", "", "", "");
 
        /*
         * Arrange to write the initial status file right away
@@ -1604,54 +2188,99 @@ PgstatCollectorMain(int argc, char *argv[])
        need_statwrite = true;
 
        /* Preset the delay between status file writes */
-       MemSet(&timeout, 0, sizeof(struct itimerval));
-       timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
-       timeout.it_value.tv_usec = PGSTAT_STAT_INTERVAL % 1000;
+       MemSet(&write_timeout, 0, sizeof(struct itimerval));
+       write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
+       write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
 
        /*
         * Read in an existing statistics stats file or initialize the stats to
         * zero.
         */
        pgStatRunningInCollector = true;
-       pgstat_read_statsfile(&pgStatDBHash, InvalidOid, NULL, NULL);
+       pgStatDBHash = pgstat_read_statsfile(InvalidOid);
 
        /*
-        * Create the known backends table
+        * Setup the descriptor set for select(2).      Since only one bit in the set
+        * ever changes, we need not repeat FD_ZERO each time.
         */
-       pgStatBeTable = (PgStat_StatBeEntry *)
-               palloc0(sizeof(PgStat_StatBeEntry) * MaxBackends);
-
-       readPipe = pgStatPipe[0];
+#if !defined(HAVE_POLL) && !defined(WIN32)
+       FD_ZERO(&rfds);
+#endif
 
        /*
-        * Process incoming messages and handle all the reporting stuff until
-        * there are no more messages.
+        * Loop to process messages until we get SIGQUIT or detect ungraceful
+        * death of our parent postmaster.
+        *
+        * For performance reasons, we don't want to do a PostmasterIsAlive() test
+        * after every message; instead, do it at statwrite time and if
+        * select()/poll() is interrupted by timeout.
         */
        for (;;)
        {
+               int                     got_data;
+
+               /*
+                * Quit if we get SIGQUIT from the postmaster.
+                */
+               if (need_exit)
+                       break;
+
                /*
-                * If time to write the stats file, do so.  Note that the alarm
+                * If time to write the stats file, do so.      Note that the alarm
                 * interrupt isn't re-enabled immediately, but only after we next
                 * receive a stats message; so no cycles are wasted when there is
                 * nothing going on.
                 */
                if (need_statwrite)
                {
+                       /* Check for postmaster death; if so we'll write file below */
+                       if (!PostmasterIsAlive(true))
+                               break;
+
                        pgstat_write_statsfile();
                        need_statwrite = false;
                        need_timer = true;
                }
 
                /*
-                * Setup the descriptor set for select(2)
+                * Wait for a message to arrive; but not for more than
+                * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
+                * shut down after an ungraceful postmaster termination; so it needn't
+                * be very fast.  However, on some systems SIGQUIT won't interrupt the
+                * poll/select call, so this also limits speed of response to SIGQUIT,
+                * which is more important.)
+                *
+                * We use poll(2) if available, otherwise select(2).
+                * Win32 has its own implementation.
                 */
-               FD_ZERO(&rfds);
-               FD_SET(readPipe, &rfds);
+#ifndef WIN32
+#ifdef HAVE_POLL
+               input_fd.fd = pgStatSock;
+               input_fd.events = POLLIN | POLLERR;
+               input_fd.revents = 0;
+
+               if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
+               {
+                       if (errno == EINTR)
+                               continue;
+                       ereport(ERROR,
+                                       (errcode_for_socket_access(),
+                                        errmsg("poll() failed in statistics collector: %m")));
+               }
+
+               got_data = (input_fd.revents != 0);
+#else                                                  /* !HAVE_POLL */
+
+               FD_SET(pgStatSock, &rfds);
 
                /*
-                * Now wait for something to do.
+                * timeout struct is modified by select() on some operating systems,
+                * so re-fill it each time.
                 */
-               if (select(readPipe + 1, &rfds, NULL, NULL, NULL) < 0)
+               sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
+               sel_timeout.tv_usec = 0;
+
+               if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
                {
                        if (errno == EINTR)
                                continue;
@@ -1660,472 +2289,135 @@ PgstatCollectorMain(int argc, char *argv[])
                                         errmsg("select() failed in statistics collector: %m")));
                }
 
+               got_data = FD_ISSET(pgStatSock, &rfds);
+#endif   /* HAVE_POLL */
+#else /* WIN32 */
+               got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
+                                                                                          PGSTAT_SELECT_TIMEOUT*1000);
+#endif
+
                /*
-                * Check if there is a new statistics message to collect.
+                * If there is a message on the socket, read it and check for
+                * validity.
                 */
-               if (FD_ISSET(readPipe, &rfds))
+               if (got_data)
                {
-                       /*
-                        * We may need to issue multiple read calls in case the buffer
-                        * process didn't write the message in a single write, which is
-                        * possible since it dumps its buffer bytewise. In any case, we'd
-                        * need two reads since we don't know the message length
-                        * initially.
-                        */
-                       int                     nread = 0;
-                       int                     targetlen = sizeof(PgStat_MsgHdr);              /* initial */
-                       bool            pipeEOF = false;
-
-                       while (nread < targetlen)
+                       len = recv(pgStatSock, (char *) &msg,
+                                          sizeof(PgStat_Msg), 0);
+                       if (len < 0)
                        {
-                               len = piperead(readPipe, ((char *) &msg) + nread,
-                                                          targetlen - nread);
-                               if (len < 0)
-                               {
-                                       if (errno == EINTR)
-                                               continue;
-                                       ereport(ERROR,
-                                                       (errcode_for_socket_access(),
-                                                        errmsg("could not read from statistics collector pipe: %m")));
-                               }
-                               if (len == 0)   /* EOF on the pipe! */
-                               {
-                                       pipeEOF = true;
-                                       break;
-                               }
-                               nread += len;
-                               if (nread == sizeof(PgStat_MsgHdr))
-                               {
-                                       /* we have the header, compute actual msg length */
-                                       targetlen = msg.msg_hdr.m_size;
-                                       if (targetlen < (int) sizeof(PgStat_MsgHdr) ||
-                                               targetlen > (int) sizeof(msg))
-                                       {
-                                               /*
-                                                * Bogus message length implies that we got out of
-                                                * sync with the buffer process somehow. Abort so that
-                                                * we can restart both processes.
-                                                */
-                                               ereport(ERROR,
-                                                         (errmsg("invalid statistics message length")));
-                                       }
-                               }
+                               if (errno == EINTR)
+                                       continue;
+                               ereport(ERROR,
+                                               (errcode_for_socket_access(),
+                                                errmsg("could not read statistics message: %m")));
                        }
 
                        /*
-                        * EOF on the pipe implies that the buffer process exited. Fall
-                        * out of outer loop.
+                        * We ignore messages that are smaller than our common header
                         */
-                       if (pipeEOF)
-                               break;
+                       if (len < sizeof(PgStat_MsgHdr))
+                               continue;
+
+                       /*
+                        * The received length must match the length in the header
+                        */
+                       if (msg.msg_hdr.m_size != len)
+                               continue;
 
                        /*
-                        * Distribute the message to the specific function handling it.
+                        * O.K. - we accept this message.  Process it.
                         */
                        switch (msg.msg_hdr.m_type)
                        {
                                case PGSTAT_MTYPE_DUMMY:
                                        break;
 
-                               case PGSTAT_MTYPE_BESTART:
-                                       pgstat_recv_bestart((PgStat_MsgBestart *) &msg, nread);
-                                       break;
-
-                               case PGSTAT_MTYPE_BETERM:
-                                       pgstat_recv_beterm((PgStat_MsgBeterm *) &msg, nread);
-                                       break;
-
                                case PGSTAT_MTYPE_TABSTAT:
-                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, nread);
+                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
                                        break;
 
                                case PGSTAT_MTYPE_TABPURGE:
-                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, nread);
-                                       break;
-
-                               case PGSTAT_MTYPE_ACTIVITY:
-                                       pgstat_recv_activity((PgStat_MsgActivity *) &msg, nread);
+                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
                                        break;
 
                                case PGSTAT_MTYPE_DROPDB:
-                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, nread);
+                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
                                        break;
 
                                case PGSTAT_MTYPE_RESETCOUNTER:
                                        pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
-                                                                                        nread);
+                                                                                        len);
                                        break;
 
                                case PGSTAT_MTYPE_AUTOVAC_START:
-                                       pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, nread);
+                                       pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
                                        break;
 
                                case PGSTAT_MTYPE_VACUUM:
-                                       pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, nread);
-                                       break;
-
-                               case PGSTAT_MTYPE_ANALYZE:
-                                       pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, nread);
+                                       pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
                                        break;
 
-                               default:
-                                       break;
-                       }
-
-                       /*
-                        * Globally count messages.
-                        */
-                       pgStatNumMessages++;
-
-                       /*
-                        * If this is the first message after we wrote the stats file the
-                        * last time, enable the alarm interrupt to make it be written
-                        * again later.
-                        */
-                       if (need_timer)
-                       {
-                               if (setitimer(ITIMER_REAL, &timeout, NULL))
-                                       ereport(ERROR,
-                                                 (errmsg("could not set statistics collector timer: %m")));
-                               need_timer = false;
-                       }
-               }
-
-               /*
-                * Note that we do NOT check for postmaster exit inside the loop; only
-                * EOF on the buffer pipe causes us to fall out.  This ensures we
-                * don't exit prematurely if there are still a few messages in the
-                * buffer or pipe at postmaster shutdown.
-                */
-       }
-
-       /*
-        * Okay, we saw EOF on the buffer pipe, so there are no more messages to
-        * process.  If the buffer process quit because of postmaster shutdown, we
-        * want to save the final stats to reuse at next startup. But if the
-        * buffer process failed, it seems best not to (there may even now be a
-        * new collector firing up, and we don't want it to read a
-        * partially-rewritten stats file).
-        */
-       if (!PostmasterIsAlive(false))
-               pgstat_write_statsfile();
-}
-
-
-/* SIGALRM signal handler for collector process */
-static void
-force_statwrite(SIGNAL_ARGS)
-{
-       need_statwrite = true;
-}
-
-
-/* ----------
- * pgstat_recvbuffer() -
- *
- *     This is the body of the separate buffering process. Its only
- *     purpose is to receive messages from the UDP socket as fast as
- *     possible and forward them over a pipe into the collector itself.
- *     If the collector is slow to absorb messages, they are buffered here.
- * ----------
- */
-static void
-pgstat_recvbuffer(void)
-{
-       fd_set          rfds;
-       fd_set          wfds;
-       struct timeval timeout;
-       int                     writePipe = pgStatPipe[1];
-       int                     maxfd;
-       int                     len;
-       int                     xfr;
-       int                     frm;
-       PgStat_Msg      input_buffer;
-       char       *msgbuffer;
-       int                     msg_send = 0;   /* next send index in buffer */
-       int                     msg_recv = 0;   /* next receive index */
-       int                     msg_have = 0;   /* number of bytes stored */
-       bool            overflow = false;
-
-       /*
-        * Identify myself via ps
-        */
-       init_ps_display("stats buffer process", "", "");
-       set_ps_display("");
-
-       /*
-        * We want to die if our child collector process does.  There are two ways
-        * we might notice that it has died: receive SIGCHLD, or get a write
-        * failure on the pipe leading to the child.  We can set SIGPIPE to kill
-        * us here.  Our SIGCHLD handler was already set up before we forked (must
-        * do it that way, else it's a race condition).
-        */
-       pqsignal(SIGPIPE, SIG_DFL);
-       PG_SETMASK(&UnBlockSig);
-
-       /*
-        * Set the write pipe to nonblock mode, so that we cannot block when the
-        * collector falls behind.
-        */
-       if (!pg_set_noblock(writePipe))
-               ereport(ERROR,
-                               (errcode_for_socket_access(),
-                                errmsg("could not set statistics collector pipe to nonblocking mode: %m")));
-
-       /*
-        * Allocate the message buffer
-        */
-       msgbuffer = (char *) palloc(PGSTAT_RECVBUFFERSZ);
-
-       /*
-        * Loop forever
-        */
-       for (;;)
-       {
-               FD_ZERO(&rfds);
-               FD_ZERO(&wfds);
-               maxfd = -1;
-
-               /*
-                * As long as we have buffer space we add the socket to the read
-                * descriptor set.
-                */
-               if (msg_have <= (int) (PGSTAT_RECVBUFFERSZ - sizeof(PgStat_Msg)))
-               {
-                       FD_SET(pgStatSock, &rfds);
-                       maxfd = pgStatSock;
-                       overflow = false;
-               }
-               else
-               {
-                       if (!overflow)
-                       {
-                               ereport(LOG,
-                                               (errmsg("statistics buffer is full")));
-                               overflow = true;
-                       }
-               }
-
-               /*
-                * If we have messages to write out, we add the pipe to the write
-                * descriptor set.
-                */
-               if (msg_have > 0)
-               {
-                       FD_SET(writePipe, &wfds);
-                       if (writePipe > maxfd)
-                               maxfd = writePipe;
-               }
-
-               /*
-                * Wait for some work to do; but not for more than 10 seconds. (This
-                * determines how quickly we will shut down after an ungraceful
-                * postmaster termination; so it needn't be very fast.)
-                *
-                * struct timeout is modified by select() on some operating systems,
-                * so re-fill it each time.
-                */
-               timeout.tv_sec = 10;
-               timeout.tv_usec = 0;
-
-               if (select(maxfd + 1, &rfds, &wfds, NULL, &timeout) < 0)
-               {
-                       if (errno == EINTR)
-                               continue;
-                       ereport(ERROR,
-                                       (errcode_for_socket_access(),
-                                        errmsg("select() failed in statistics buffer: %m")));
-               }
-
-               /*
-                * If there is a message on the socket, read it and check for
-                * validity.
-                */
-               if (FD_ISSET(pgStatSock, &rfds))
-               {
-                       len = recv(pgStatSock, (char *) &input_buffer,
-                                          sizeof(PgStat_Msg), 0);
-                       if (len < 0)
-                               ereport(ERROR,
-                                               (errcode_for_socket_access(),
-                                                errmsg("could not read statistics message: %m")));
-
-                       /*
-                        * We ignore messages that are smaller than our common header
-                        */
-                       if (len < sizeof(PgStat_MsgHdr))
-                               continue;
+                               case PGSTAT_MTYPE_ANALYZE:
+                                       pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
+                                       break;
 
-                       /*
-                        * The received length must match the length in the header
-                        */
-                       if (input_buffer.msg_hdr.m_size != len)
-                               continue;
+                               case PGSTAT_MTYPE_BGWRITER:
+                                       pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+                                       break;
+
+                               default:
+                                       break;
+                       }
 
                        /*
-                        * O.K. - we accept this message.  Copy it to the circular
-                        * msgbuffer.
+                        * If this is the first message after we wrote the stats file the
+                        * last time, enable the alarm interrupt to make it be written
+                        * again later.
                         */
-                       frm = 0;
-                       while (len > 0)
+                       if (need_timer)
                        {
-                               xfr = PGSTAT_RECVBUFFERSZ - msg_recv;
-                               if (xfr > len)
-                                       xfr = len;
-                               Assert(xfr > 0);
-                               memcpy(msgbuffer + msg_recv,
-                                          ((char *) &input_buffer) + frm,
-                                          xfr);
-                               msg_recv += xfr;
-                               if (msg_recv == PGSTAT_RECVBUFFERSZ)
-                                       msg_recv = 0;
-                               msg_have += xfr;
-                               frm += xfr;
-                               len -= xfr;
+                               if (setitimer(ITIMER_REAL, &write_timeout, NULL))
+                                       ereport(ERROR,
+                                       (errmsg("could not set statistics collector timer: %m")));
+                               need_timer = false;
                        }
                }
-
-               /*
-                * If the collector is ready to receive, write some data into his
-                * pipe.  We may or may not be able to write all that we have.
-                *
-                * NOTE: if what we have is less than PIPE_BUF bytes but more than the
-                * space available in the pipe buffer, most kernels will refuse to
-                * write any of it, and will return EAGAIN.  This means we will
-                * busy-loop until the situation changes (either because the collector
-                * caught up, or because more data arrives so that we have more than
-                * PIPE_BUF bytes buffered).  This is not good, but is there any way
-                * around it?  We have no way to tell when the collector has caught
-                * up...
-                */
-               if (FD_ISSET(writePipe, &wfds))
+               else
                {
-                       xfr = PGSTAT_RECVBUFFERSZ - msg_send;
-                       if (xfr > msg_have)
-                               xfr = msg_have;
-                       Assert(xfr > 0);
-                       len = pipewrite(writePipe, msgbuffer + msg_send, xfr);
-                       if (len < 0)
-                       {
-                               if (errno == EINTR || errno == EAGAIN)
-                                       continue;       /* not enough space in pipe */
-                               ereport(ERROR,
-                                               (errcode_for_socket_access(),
-                               errmsg("could not write to statistics collector pipe: %m")));
-                       }
-                       /* NB: len < xfr is okay */
-                       msg_send += len;
-                       if (msg_send == PGSTAT_RECVBUFFERSZ)
-                               msg_send = 0;
-                       msg_have -= len;
+                       /*
+                        * We can only get here if the select/poll timeout elapsed. Check
+                        * for postmaster death.
+                        */
+                       if (!PostmasterIsAlive(true))
+                               break;
                }
+       }                                                       /* end of message-processing loop */
 
-               /*
-                * Make sure we forwarded all messages before we check for postmaster
-                * termination.
-                */
-               if (msg_have != 0 || FD_ISSET(pgStatSock, &rfds))
-                       continue;
-
-               /*
-                * If the postmaster has terminated, we die too.  (This is no longer
-                * the normal exit path, however.)
-                */
-               if (!PostmasterIsAlive(true))
-                       exit(0);
-       }
-}
-
-/* SIGQUIT signal handler for buffer process */
-static void
-pgstat_exit(SIGNAL_ARGS)
-{
        /*
-        * For now, we just nail the doors shut and get out of town.  It might be
-        * cleaner to allow any pending messages to be sent, but that creates a
-        * tradeoff against speed of exit.
+        * Save the final stats to reuse at next startup.
         */
+       pgstat_write_statsfile();
 
-       /*
-        * If running in bufferer, kill our collector as well. On some broken
-        * win32 systems, it does not shut down automatically because of issues
-        * with socket inheritance.  XXX so why not fix the socket inheritance...
-        */
-#ifdef WIN32
-       if (pgStatCollectorPid > 0)
-               kill(pgStatCollectorPid, SIGQUIT);
-#endif
        exit(0);
 }
 
-/* SIGCHLD signal handler for buffer process */
+
+/* SIGQUIT signal handler for collector process */
 static void
-pgstat_die(SIGNAL_ARGS)
+pgstat_exit(SIGNAL_ARGS)
 {
-       exit(1);
+       need_exit = true;
 }
 
-
-/* ----------
- * pgstat_add_backend() -
- *
- *     Support function to keep our backend list up to date.
- * ----------
- */
-static int
-pgstat_add_backend(PgStat_MsgHdr *msg)
+/* SIGALRM signal handler for collector process */
+static void
+force_statwrite(SIGNAL_ARGS)
 {
-       PgStat_StatBeEntry *beentry;
-
-       /*
-        * Check that the backend ID is valid
-        */
-       if (msg->m_backendid < 1 || msg->m_backendid > MaxBackends)
-       {
-               ereport(LOG,
-                               (errmsg("invalid server process ID %d", msg->m_backendid)));
-               return -1;
-       }
-
-       /*
-        * Get the slot for this backendid.
-        */
-       beentry = &pgStatBeTable[msg->m_backendid - 1];
-
-       /*
-        * If the slot contains the PID of this backend, everything is fine and we
-        * have nothing to do. Note that all the slots are zero'd out when the
-        * collector is started. We assume that a slot is "empty" iff procpid ==
-        * 0.
-        */
-       if (beentry->procpid > 0 && beentry->procpid == msg->m_procpid)
-               return 0;
-
-       /* Must be able to distinguish between empty and non-empty slots */
-       Assert(msg->m_procpid > 0);
-
-       /*
-        * Put this new backend into the slot (possibly overwriting an old entry,
-        * if we missed its BETERM or the BETERM hasn't arrived yet).
-        */
-       beentry->procpid = msg->m_procpid;
-       beentry->start_timestamp = GetCurrentTimestamp();
-       beentry->activity_start_timestamp = 0;
-       beentry->activity[0] = '\0';
-
-       /*
-        * We can't initialize the rest of the data in this slot until we see the
-        * BESTART message. Therefore, we set the database and user to sentinel
-        * values, to indicate "undefined". There is no easy way to do this for
-        * the client address, so make sure to check that the database or user are
-        * defined before accessing the client address.
-        */
-       beentry->userid = InvalidOid;
-       beentry->databaseid = InvalidOid;
-
-       return 0;
+       need_statwrite = true;
 }
 
+
 /*
  * Lookup the hash table entry for the specified database. If no hash
  * table entry exists, initialize it, if the create parameter is true.
@@ -2156,6 +2448,11 @@ pgstat_get_db_entry(Oid databaseid, bool create)
                result->n_xact_rollback = 0;
                result->n_blocks_fetched = 0;
                result->n_blocks_hit = 0;
+               result->n_tuples_returned = 0;
+               result->n_tuples_fetched = 0;
+               result->n_tuples_inserted = 0;
+               result->n_tuples_updated = 0;
+               result->n_tuples_deleted = 0;
                result->last_autovac_time = 0;
 
                memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -2171,38 +2468,6 @@ pgstat_get_db_entry(Oid databaseid, bool create)
        return result;
 }
 
-/* ----------
- * pgstat_sub_backend() -
- *
- *     Remove a backend from the actual backends list.
- * ----------
- */
-static void
-pgstat_sub_backend(int procpid)
-{
-       int                     i;
-
-       /*
-        * Search in the known-backends table for the slot containing this PID.
-        */
-       for (i = 0; i < MaxBackends; i++)
-       {
-               if (pgStatBeTable[i].procpid == procpid)
-               {
-                       /*
-                        * That's him.  Mark the backend slot empty.
-                        */
-                       pgStatBeTable[i].procpid = 0;
-                       return;
-               }
-       }
-
-       /*
-        * No big problem if not found. This can happen if UDP messages arrive out
-        * of order here.
-        */
-}
-
 
 /* ----------
  * pgstat_write_statsfile() -
@@ -2218,7 +2483,6 @@ pgstat_write_statsfile(void)
        PgStat_StatDBEntry *dbentry;
        PgStat_StatTabEntry *tabentry;
        FILE       *fpout;
-       int                     i;
        int32           format_id;
 
        /*
@@ -2240,6 +2504,11 @@ pgstat_write_statsfile(void)
        format_id = PGSTAT_FILE_FORMAT_ID;
        fwrite(&format_id, sizeof(format_id), 1, fpout);
 
+       /*
+        * Write global stats struct
+        */
+       fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
        /*
         * Walk through the database table.
         */
@@ -2247,9 +2516,9 @@ pgstat_write_statsfile(void)
        while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
        {
                /*
-                * Write out the DB entry including the number of live backends.
-                * We don't write the tables pointer since it's of no use to any
-                * other process.
+                * Write out the leading part of the DB entry, i.e. everything
+                * that precedes the tables pointer.  We don't write the pointer
+                * itself since it's of no use to any other process.
                 */
                fputc('D', fpout);
                fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
@@ -2270,29 +2539,6 @@ pgstat_write_statsfile(void)
                fputc('d', fpout);
        }
 
-       /*
-        * Write out the known running backends to the stats file.
-        */
-       i = MaxBackends;
-       fputc('M', fpout);
-       fwrite(&i, sizeof(i), 1, fpout);
-
-       for (i = 0; i < MaxBackends; i++)
-       {
-               PgStat_StatBeEntry *beentry = &pgStatBeTable[i];
-
-               if (beentry->procpid > 0)
-               {
-                       int             len;
-
-                       len = offsetof(PgStat_StatBeEntry, activity) +
-                               strlen(beentry->activity) + 1;
-                       fputc('B', fpout);
-                       fwrite(&len, sizeof(len), 1, fpout);
-                       fwrite(beentry, len, 1, fpout);
-               }
-       }
-
        /*
         * No more output to be done. Close the temp file and replace the old
         * pgstat.stat with it.  The ferror() check replaces testing for error
@@ -2304,8 +2550,8 @@ pgstat_write_statsfile(void)
        {
                ereport(LOG,
                                (errcode_for_file_access(),
-                                errmsg("could not write temporary statistics file \"%s\": %m",
-                                               PGSTAT_STAT_TMPFILE)));
+                          errmsg("could not write temporary statistics file \"%s\": %m",
+                                         PGSTAT_STAT_TMPFILE)));
                fclose(fpout);
                unlink(PGSTAT_STAT_TMPFILE);
        }
@@ -2327,73 +2573,32 @@ pgstat_write_statsfile(void)
        }
 }
 
-/*
- * qsort/bsearch comparison routine for PIDs
- *
- * We assume PIDs are nonnegative, so there's no overflow risk
- */
-static int
-comparePids(const void *v1, const void *v2)
-{
-       return *((const int *) v1) - *((const int *) v2);
-}
 
 /* ----------
  * pgstat_read_statsfile() -
  *
- *     Reads in an existing statistics collector and initializes the
- *     databases' hash table (whose entries point to the tables' hash tables)
- *     and the current backend table.
+ *     Reads in an existing statistics collector file and initializes the
+ *     databases' hash table (whose entries point to the tables' hash tables).
  * ----------
  */
-static void
-pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-                                         PgStat_StatBeEntry **betab, int *numbackends)
+static HTAB *
+pgstat_read_statsfile(Oid onlydb)
 {
        PgStat_StatDBEntry *dbentry;
        PgStat_StatDBEntry dbbuf;
        PgStat_StatTabEntry *tabentry;
        PgStat_StatTabEntry tabbuf;
-       PgStat_StatBeEntry *beentry;
        HASHCTL         hash_ctl;
+       HTAB       *dbhash;
        HTAB       *tabhash = NULL;
        FILE       *fpin;
        int32           format_id;
-       int                     len;
-       int                     maxbackends = 0;
-       int                     havebackends = 0;
        bool            found;
-       int                *live_pids;
-       MemoryContext use_mcxt;
-       int                     mcxt_flags;
 
        /*
-        * If running in the collector or the autovacuum process, we use the
-        * DynaHashCxt memory context.  If running in a backend, we use the
-        * TopTransactionContext instead, so the caller must only know the last
-        * XactId when this call happened to know if his tables are still valid or
-        * already gone!
-        *
-        * Also, if running in a regular backend, we check backend entries against
-        * the PGPROC array so that we can detect stale entries.  This lets us
-        * discard entries whose BETERM message got lost for some reason.
+        * The tables will live in pgStatLocalContext.
         */
-       if (pgStatRunningInCollector || IsAutoVacuumProcess())
-       {
-               use_mcxt = NULL;
-               mcxt_flags = 0;
-               live_pids = NULL;
-       }
-       else
-       {
-               use_mcxt = TopTransactionContext;
-               mcxt_flags = HASH_CONTEXT;
-               live_pids = GetAllBackendPids();
-               /* Sort the PID array so we can use bsearch */
-               if (live_pids[0] > 1)
-                       qsort((void *) &live_pids[1], live_pids[0], sizeof(int),
-                                 comparePids);
-       }
+       pgstat_setup_memcxt();
 
        /*
         * Create the DB hashtable
@@ -2402,18 +2607,15 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
        hash_ctl.keysize = sizeof(Oid);
        hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
        hash_ctl.hash = oid_hash;
-       hash_ctl.hcxt = use_mcxt;
-       *dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-                                                 HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+       hash_ctl.hcxt = pgStatLocalContext;
+       dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
+                                                HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
        /*
-        * Initialize the number of known backends to zero, just in case we do a
-        * silent error return below.
+        * Clear out global statistics so they start from zero in case we can't
+        * load an existing statsfile.
         */
-       if (numbackends != NULL)
-               *numbackends = 0;
-       if (betab != NULL)
-               *betab = NULL;
+       memset(&globalStats, 0, sizeof(globalStats));
 
        /*
         * Try to open the status file. If it doesn't exist, the backends simply
@@ -2421,7 +2623,7 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
         * with empty counters.
         */
        if ((fpin = AllocateFile(PGSTAT_STAT_FILENAME, PG_BINARY_R)) == NULL)
-               return;
+               return dbhash;
 
        /*
         * Verify it's of the expected format.
@@ -2434,6 +2636,16 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                goto done;
        }
 
+       /*
+        * Read global stats struct
+        */
+       if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+       {
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted pgstat.stat file")));
+               goto done;
+       }
+
        /*
         * We found an existing collector stats file. Read it and put all the
         * hashtable entries into place.
@@ -2459,7 +2671,7 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                                /*
                                 * Add to the DB hash
                                 */
-                               dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
+                               dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
                                                                                                  (void *) &dbbuf.databaseid,
                                                                                                                         HASH_ENTER,
                                                                                                                         &found);
@@ -2472,7 +2684,6 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 
                                memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
                                dbentry->tables = NULL;
-                               dbentry->n_backends = 0;
 
                                /*
                                 * Don't collect tables if not the requested DB (or the
@@ -2489,11 +2700,11 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                                hash_ctl.keysize = sizeof(Oid);
                                hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
                                hash_ctl.hash = oid_hash;
-                               hash_ctl.hcxt = use_mcxt;
+                               hash_ctl.hcxt = pgStatLocalContext;
                                dbentry->tables = hash_create("Per-database table",
                                                                                          PGSTAT_TAB_HASH_SIZE,
                                                                                          &hash_ctl,
-                                                                        HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+                                                                        HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
                                /*
                                 * Arrange that following 'T's add entries to this database's
@@ -2531,343 +2742,96 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
                                                                                                        (void *) &tabbuf.tableid,
                                                                                                                 HASH_ENTER, &found);
 
-                               if (found)
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       goto done;
-                               }
-
-                               memcpy(tabentry, &tabbuf, sizeof(tabbuf));
-                               break;
-
-                               /*
-                                * 'M'  The maximum number of backends to expect follows.
-                                */
-                       case 'M':
-                               if (betab == NULL || numbackends == NULL)
-                                       goto done;
-                               if (fread(&maxbackends, 1, sizeof(maxbackends), fpin) !=
-                                       sizeof(maxbackends))
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       goto done;
-                               }
-                               if (maxbackends == 0)
-                                       goto done;
-
-                               /*
-                                * Allocate space (in TopTransactionContext too) for the
-                                * backend table.
-                                */
-                               if (use_mcxt == NULL)
-                                       *betab = (PgStat_StatBeEntry *)
-                                               palloc(sizeof(PgStat_StatBeEntry) * maxbackends);
-                               else
-                                       *betab = (PgStat_StatBeEntry *)
-                                               MemoryContextAlloc(use_mcxt,
-                                                                  sizeof(PgStat_StatBeEntry) * maxbackends);
-                               break;
-
-                               /*
-                                * 'B'  A PgStat_StatBeEntry follows.
-                                */
-                       case 'B':
-                               if (betab == NULL || numbackends == NULL || *betab == NULL)
-                                       goto done;
-
-                               if (havebackends >= maxbackends)
-                                       goto done;
-
-                               /* Read and validate the entry length */
-                               if (fread(&len, 1, sizeof(len), fpin) != sizeof(len))
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       goto done;
-                               }
-                               if (len <= offsetof(PgStat_StatBeEntry, activity) ||
-                                       len > sizeof(PgStat_StatBeEntry))
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       goto done;
-                               }
-
-                               /*
-                                * Read it directly into the table.
-                                */
-                               beentry = &(*betab)[havebackends];
-
-                               if (fread(beentry, 1, len, fpin) != len)
-                               {
-                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
-                                       goto done;
-                               }
-
-                               /*
-                                * If possible, check PID to verify still running
-                                */
-                               if (live_pids &&
-                                       (live_pids[0] == 0 ||
-                                        bsearch((void *) &beentry->procpid,
-                                                        (void *) &live_pids[1],
-                                                        live_pids[0],
-                                                        sizeof(int),
-                                                        comparePids) == NULL))
-                               {
-                                       /*
-                                        * Note: we could send a BETERM message to tell the
-                                        * collector to drop the entry, but I'm a bit worried
-                                        * about race conditions.  For now, just silently ignore
-                                        * dead entries; they'll get recycled eventually anyway.
-                                        */
-
-                                       /* Don't accept the entry */
-                                       memset(beentry, 0, sizeof(PgStat_StatBeEntry));
-                                       break;
-                               }
-
-                               /*
-                                * Count backends per database here.
-                                */
-                               dbentry = (PgStat_StatDBEntry *)
-                                       hash_search(*dbhash,
-                                                               &(beentry->databaseid),
-                                                               HASH_FIND,
-                                                               NULL);
-                               if (dbentry)
-                                       dbentry->n_backends++;
-
-                               havebackends++;
-                               *numbackends = havebackends;
-
-                               break;
-
-                               /*
-                                * 'E'  The EOF marker of a complete stats file.
-                                */
-                       case 'E':
-                               goto done;
-
-                       default:
-                               ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                               (errmsg("corrupted pgstat.stat file")));
-                               goto done;
-               }
-       }
-
-done:
-       FreeFile(fpin);
-}
-
-/*
- * If not done for this transaction, read the statistics collector
- * stats file into some hash tables.
- *
- * Because we store the hash tables in TopTransactionContext, the result
- * is good for the entire current main transaction.
- *
- * Inside the autovacuum process, the statfile is assumed to be valid
- * "forever", that is one iteration, within one database.  This means
- * we only consider the statistics as they were when the autovacuum
- * iteration started.
- */
-static void
-backend_read_statsfile(void)
-{
-       if (IsAutoVacuumProcess())
-       {
-               /* already read it? */
-               if (pgStatDBHash)
-                       return;
-               Assert(!pgStatRunningInCollector);
-               pgstat_read_statsfile(&pgStatDBHash, InvalidOid,
-                                                         &pgStatBeTable, &pgStatNumBackends);
-       }
-       else
-       {
-               TransactionId topXid = GetTopTransactionId();
-
-               if (!TransactionIdEquals(pgStatDBHashXact, topXid))
-               {
-                       Assert(!pgStatRunningInCollector);
-                       pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-                                                                 &pgStatBeTable, &pgStatNumBackends);
-                       pgStatDBHashXact = topXid;
-               }
-       }
-}
-
-
-/* ----------
- * pgstat_recv_bestart() -
- *
- *     Process a backend startup message.
- * ----------
- */
-static void
-pgstat_recv_bestart(PgStat_MsgBestart *msg, int len)
-{
-       PgStat_StatBeEntry *entry;
-
-       /*
-        * If the backend is known dead, we ignore the message -- we don't want to
-        * update the backend entry's state since this BESTART message refers to
-        * an old, dead backend
-        */
-       if (pgstat_add_backend(&msg->m_hdr) != 0)
-               return;
-
-       entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
-       entry->userid = msg->m_userid;
-       memcpy(&entry->clientaddr, &msg->m_clientaddr, sizeof(entry->clientaddr));
-       entry->databaseid = msg->m_databaseid;
-}
+                               if (found)
+                               {
+                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                                       (errmsg("corrupted pgstat.stat file")));
+                                       goto done;
+                               }
 
+                               memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+                               break;
 
-/* ----------
- * pgstat_recv_beterm() -
- *
- *     Process a backend termination message.
- * ----------
- */
-static void
-pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len)
-{
-       pgstat_sub_backend(msg->m_hdr.m_procpid);
-}
+                               /*
+                                * 'E'  The EOF marker of a complete stats file.
+                                */
+                       case 'E':
+                               goto done;
 
-/* ----------
- * pgstat_recv_autovac() -
- *
- *     Process an autovacuum signalling message.
- * ----------
- */
-static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
-{
-       PgStat_StatDBEntry *dbentry;
+                       default:
+                               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                               (errmsg("corrupted pgstat.stat file")));
+                               goto done;
+               }
+       }
 
-       /*
-        * Lookup the database in the hashtable.  Don't create the entry if it
-        * doesn't exist, because autovacuum may be processing a template
-        * database.  If this isn't the case, the database is most likely to have
-        * an entry already.  (If it doesn't, not much harm is done anyway --
-        * it'll get created as soon as somebody actually uses the database.)
-        */
-       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
-               return;
+done:
+       FreeFile(fpin);
 
-       /*
-        * Store the last autovacuum time in the database entry.
-        */
-       dbentry->last_autovac_time = msg->m_start_time;
+       return dbhash;
 }
 
-/* ----------
- * pgstat_recv_vacuum() -
- *
- *     Process a VACUUM message.
- * ----------
+/*
+ * If not already done, read the statistics collector stats file into
+ * some hash tables.  The results will be kept until pgstat_clear_snapshot()
+ * is called (typically, at end of transaction).
  */
 static void
-pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
+backend_read_statsfile(void)
 {
-       PgStat_StatDBEntry *dbentry;
-       PgStat_StatTabEntry *tabentry;
-
-       /*
-        * Don't create either the database or table entry if it doesn't already
-        * exist.  This avoids bloating the stats with entries for stuff that is
-        * only touched by vacuum and not by live operations.
-        */
-       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
-               return;
-
-       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-                                                  HASH_FIND, NULL);
-       if (tabentry == NULL)
+       /* already read it? */
+       if (pgStatDBHash)
                return;
+       Assert(!pgStatRunningInCollector);
 
-       if (msg->m_autovacuum) 
-               tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
-       else 
-               tabentry->vacuum_timestamp = msg->m_vacuumtime; 
-       tabentry->n_live_tuples = msg->m_tuples;
-       tabentry->n_dead_tuples = 0;
-       if (msg->m_analyze)
-       {
-               tabentry->last_anl_tuples = msg->m_tuples;
-               if (msg->m_autovacuum)
-                       tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
-               else
-                       tabentry->analyze_timestamp = msg->m_vacuumtime;
-       }
+       /* Autovacuum launcher wants stats about all databases */
+       if (IsAutoVacuumLauncherProcess())
+               pgStatDBHash = pgstat_read_statsfile(InvalidOid);
+       else
+               pgStatDBHash = pgstat_read_statsfile(MyDatabaseId);
 }
 
+
 /* ----------
- * pgstat_recv_analyze() -
+ * pgstat_setup_memcxt() -
  *
- *     Process an ANALYZE message.
+ *     Create pgStatLocalContext, if not already done.
  * ----------
  */
 static void
-pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
+pgstat_setup_memcxt(void)
 {
-       PgStat_StatDBEntry *dbentry;
-       PgStat_StatTabEntry *tabentry;
-
-       /*
-        * Don't create either the database or table entry if it doesn't already
-        * exist.  This avoids bloating the stats with entries for stuff that is
-        * only touched by analyze and not by live operations.
-        */
-       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
-               return;
-
-       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-                                                  HASH_FIND, NULL);
-       if (tabentry == NULL)
-               return;
-
-       if (msg->m_autovacuum) 
-               tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
-       else 
-               tabentry->analyze_timestamp = msg->m_analyzetime;
-       tabentry->n_live_tuples = msg->m_live_tuples;
-       tabentry->n_dead_tuples = msg->m_dead_tuples;
-       tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
+       if (!pgStatLocalContext)
+               pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
+                                                                                                  "Statistics snapshot",
+                                                                                                  ALLOCSET_SMALL_MINSIZE,
+                                                                                                  ALLOCSET_SMALL_INITSIZE,
+                                                                                                  ALLOCSET_SMALL_MAXSIZE);
 }
 
+
 /* ----------
- * pgstat_recv_activity() -
+ * pgstat_clear_snapshot() -
  *
- *     Remember what the backend is doing.
+ *     Discard any data collected in the current transaction.  Any subsequent
+ *     request will cause new snapshots to be read.
+ *
+ *     This is also invoked during transaction commit or abort to discard
+ *     the no-longer-wanted snapshot.
  * ----------
  */
-static void
-pgstat_recv_activity(PgStat_MsgActivity *msg, int len)
+void
+pgstat_clear_snapshot(void)
 {
-       PgStat_StatBeEntry *entry;
-
-       /*
-        * Here we check explicitly for 0 return, since we don't want to mangle
-        * the activity of an active backend by a delayed packet from a dead one.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) != 0)
-               return;
-
-       entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
-
-       StrNCpy(entry->activity, msg->m_cmd_str, PGSTAT_ACTIVITY_SIZE);
-
-       entry->activity_start_timestamp = GetCurrentTimestamp();
+       /* Release memory, if any was allocated */
+       if (pgStatLocalContext)
+               MemoryContextDelete(pgStatLocalContext);
+
+       /* Reset variables */
+       pgStatLocalContext = NULL;
+       pgStatDBHash = NULL;
+       localBackendStatusTable = NULL;
+       localNumBackends = 0;
 }
 
 
@@ -2886,12 +2850,6 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
        int                     i;
        bool            found;
 
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
        dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
        /*
@@ -2915,50 +2873,50 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                         * If it's a new table entry, initialize counters to the values we
                         * just got.
                         */
-                       tabentry->numscans = tabmsg[i].t_numscans;
-                       tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
+                       tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
+
                        tabentry->last_anl_tuples = 0;
                        tabentry->vacuum_timestamp = 0;
                        tabentry->autovac_vacuum_timestamp = 0;
                        tabentry->analyze_timestamp = 0;
                        tabentry->autovac_analyze_timestamp = 0;
-
-                       tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
                }
                else
                {
                        /*
                         * Otherwise add the values to the existing entry.
                         */
-                       tabentry->numscans += tabmsg[i].t_numscans;
-                       tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
-
-                       tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+                       tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
                }
 
                /*
-                * And add the block IO to the database entry.
+                * Add per-table stats to the per-database entry, too.
                 */
-               dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-               dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+               dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+               dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+               dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+               dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+               dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+               dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+               dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
        }
 }
 
@@ -2975,12 +2933,6 @@ pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
        PgStat_StatDBEntry *dbentry;
        int                     i;
 
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
        dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
 
        /*
@@ -3013,12 +2965,6 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
 {
        PgStat_StatDBEntry *dbentry;
 
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
        /*
         * Lookup the database in the hashtable.
         */
@@ -3054,12 +3000,6 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
        HASHCTL         hash_ctl;
        PgStat_StatDBEntry *dbentry;
 
-       /*
-        * Make sure the backend is counted for.
-        */
-       if (pgstat_add_backend(&msg->m_hdr) < 0)
-               return;
-
        /*
         * Lookup the database in the hashtable.  Nothing to do if not there.
         */
@@ -3090,3 +3030,133 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
                                                                  &hash_ctl,
                                                                  HASH_ELEM | HASH_FUNCTION);
 }
+
+/* ----------
+ * pgstat_recv_autovac() -
+ *
+ *     Process an autovacuum signalling message ("len" is not examined).
+ * ----------
+ */
+static void
+pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+
+       /*
+        * Look up the database in the hashtable.  Don't create the entry if it
+        * doesn't exist, because autovacuum may be processing a template
+        * database.  If this isn't the case, the database is most likely to have
+        * an entry already.  (If it doesn't, not much harm is done anyway --
+        * it'll get created as soon as somebody actually uses the database.)
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;                 /* no entry for this database: drop the message */
+
+       /*
+        * Store the message's start time as the database's last autovacuum time.
+        */
+       dbentry->last_autovac_time = msg->m_start_time;
+}
+
+/* ----------
+ * pgstat_recv_vacuum() -
+ *
+ *     Process a VACUUM message: update the table's tuple counts and timestamps.
+ * ----------
+ */
+static void
+pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatTabEntry *tabentry;
+
+       /*
+        * Don't create either the database or table entry if it doesn't already
+        * exist.  This avoids bloating the stats with entries for stuff that is
+        * only touched by vacuum and not by live operations.
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;                 /* database has no stats entry: ignore message */
+
+       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+                                                  HASH_FIND, NULL);
+       if (tabentry == NULL)
+               return;                 /* table has no stats entry: ignore message */
+
+       if (msg->m_autovacuum)
+               tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
+       else
+               tabentry->vacuum_timestamp = msg->m_vacuumtime;
+       tabentry->n_live_tuples = msg->m_tuples;        /* reported count replaces the old one */
+       tabentry->n_dead_tuples = 0;    /* dead-tuple counter restarts from zero */
+       if (msg->m_analyze)             /* flag set: also record analyze info */
+       {
+               tabentry->last_anl_tuples = msg->m_tuples;
+               if (msg->m_autovacuum)
+                       tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
+               else
+                       tabentry->analyze_timestamp = msg->m_vacuumtime;
+       }
+       else
+       {
+               /* last_anl_tuples must never exceed n_live_tuples */
+               tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples,
+                                                                               msg->m_tuples);
+       }
+}
+
+/* ----------
+ * pgstat_recv_analyze() -
+ *
+ *     Process an ANALYZE message: update the table's tuple counts and timestamp.
+ * ----------
+ */
+static void
+pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatTabEntry *tabentry;
+
+       /*
+        * Don't create either the database or table entry if it doesn't already
+        * exist.  This avoids bloating the stats with entries for stuff that is
+        * only touched by analyze and not by live operations.
+        */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+       if (dbentry == NULL)
+               return;                 /* database has no stats entry: ignore message */
+
+       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+                                                  HASH_FIND, NULL);
+       if (tabentry == NULL)
+               return;                 /* table has no stats entry: ignore message */
+
+       if (msg->m_autovacuum)
+               tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
+       else
+               tabentry->analyze_timestamp = msg->m_analyzetime;
+       tabentry->n_live_tuples = msg->m_live_tuples;   /* overwrite, not accumulate */
+       tabentry->n_dead_tuples = msg->m_dead_tuples;   /* overwrite, not accumulate */
+       tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;    /* live + dead */
+}
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ *     Process a BGWRITER message by adding its counters to globalStats.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+       globalStats.timed_checkpoints += msg->m_timed_checkpoints;      /* each field accumulates */
+       globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+       globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+       globalStats.buf_written_lru += msg->m_buf_written_lru;
+       globalStats.buf_written_all += msg->m_buf_written_all;
+       globalStats.maxwritten_lru += msg->m_maxwritten_lru;
+       globalStats.maxwritten_all += msg->m_maxwritten_all;
+}