X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Fpostmaster%2Fpgstat.c;h=b41a16de44ce86435068597a40e0fa3537ccd08b;hb=77947c51c08179b8bc12347a7fbcb2c8d7908302;hp=04d8890afd6bc0f4246518003dcd97489c578caa;hpb=dbf53e6345b17be375d35043b6a0e64276f87b75;p=postgresql

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 04d8890afd..b41a16de44 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -11,9 +11,9 @@
  *			- Add a pgstat config column to pg_database, so this
  *			  entire thing can be enabled/disabled on a per db basis.
  *
- *	Copyright (c) 2001-2005, PostgreSQL Global Development Group
+ *	Copyright (c) 2001-2007, PostgreSQL Global Development Group
  *
- *	$PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.115 2005/12/31 19:39:10 momjian Exp $
+ *	$PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
  * ----------
  */
 #include "postgres.h"
@@ -28,12 +28,21 @@
 #include <arpa/inet.h>
 #include <signal.h>
 #include <time.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#ifdef HAVE_SYS_POLL_H
+#include <sys/poll.h>
+#endif
 
 #include "pgstat.h"
 
 #include "access/heapam.h"
+#include "access/transam.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/pg_database.h"
+#include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
 #include "mb/pg_wchar.h"
@@ -46,13 +55,8 @@
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
-#include "storage/procarray.h"
-#include "tcop/tcopprot.h"
-#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
-#include "utils/rel.h"
-#include "utils/syscache.h"
 
 
 /* ----------
@@ -69,28 +73,19 @@
 #define PGSTAT_STAT_INTERVAL	500		/* How often to write the status file;
 										 * in milliseconds. */
 
-#define PGSTAT_DESTROY_DELAY	10000	/* How long to keep destroyed objects
-										 * known, to give delayed UDP packets
-										 * time to arrive; in milliseconds. */
-
-#define PGSTAT_DESTROY_COUNT	(PGSTAT_DESTROY_DELAY / PGSTAT_STAT_INTERVAL)
-
 #define PGSTAT_RESTART_INTERVAL 60		/* How often to attempt to restart a
 										 * failed statistics collector; in
 										 * seconds. */
 
-/* ----------
- * Amount of space reserved in pgstat_recvbuffer().
- * ----------
- */
-#define PGSTAT_RECVBUFFERSZ		((int) (1024 * sizeof(PgStat_Msg)))
+#define PGSTAT_SELECT_TIMEOUT	2		/* How often to check for postmaster
+										 * death; in seconds. */
+
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
 #define PGSTAT_DB_HASH_SIZE		16
-#define PGSTAT_BE_HASH_SIZE		512
 #define PGSTAT_TAB_HASH_SIZE	512
 
 
@@ -100,50 +95,96 @@
  */
 bool		pgstat_collect_startcollector = true;
 bool		pgstat_collect_resetonpmstart = false;
-bool		pgstat_collect_querystring = false;
 bool		pgstat_collect_tuplelevel = false;
 bool		pgstat_collect_blocklevel = false;
+bool		pgstat_collect_querystring = false;
+
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
 
 /* ----------
  * Local data
  * ----------
  */
 NON_EXEC_STATIC int pgStatSock = -1;
-NON_EXEC_STATIC int pgStatPipe[2] = {-1, -1};
+
 static struct sockaddr_storage pgStatAddr;
-static pid_t pgStatCollectorPid = 0;
 
 static time_t last_pgstat_start_time;
 
-static long pgStatNumMessages = 0;
-
-static bool pgStatRunningInCollector = FALSE;
+static bool pgStatRunningInCollector = false;
 
 /*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
  */
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM		100			/* we alloc this many at a time */
+
+typedef struct TabStatusArray
 {
-	int			tsa_alloc;		/* num allocated */
-	int			tsa_used;		/* num actually used */
-	PgStat_MsgTabstat **tsa_messages;	/* the array itself */
-} TabStatArray;
+	struct TabStatusArray *tsa_next;	/* link to next array, if any */
+	int			tsa_used;				/* # entries currently used */
+	PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];	/* per-table data */
+} TabStatusArray;
+
+static TabStatusArray *pgStatTabList = NULL;
 
-#define TABSTAT_QUANTUM		4	/* we alloc this many at a time */
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+	int			nest_level;				/* subtransaction nest level */
+	struct PgStat_SubXactStatus *prev;	/* higher-level subxact if any */
+	PgStat_TableXactStatus *first;		/* head of list for this subxact */
+} PgStat_SubXactStatus;
 
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
 
 static int	pgStatXactCommit = 0;
 static int	pgStatXactRollback = 0;
 
-static TransactionId pgStatDBHashXact = InvalidTransactionId;
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+	PgStat_Counter tuples_inserted;	/* tuples inserted in xact */
+	PgStat_Counter tuples_deleted;	/* tuples deleted in xact */
+	Oid			t_id;				/* table's OID */
+	bool		t_shared;			/* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
+static MemoryContext pgStatLocalContext = NULL;
 static HTAB *pgStatDBHash = NULL;
-static HTAB *pgStatBeDead = NULL;
-static PgStat_StatBeEntry *pgStatBeTable = NULL;
-static int	pgStatNumBackends = 0;
+static PgBackendStatus *localBackendStatusTable = NULL;
+static int	localNumBackends = 0;
+
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
+
+static volatile bool need_exit = false;
+static volatile bool need_statwrite = false;
 
 
 /* ----------
@@ -151,40 +192,30 @@ static int	pgStatNumBackends = 0;
  * ----------
  */
 #ifdef EXEC_BACKEND
-
-typedef enum STATS_PROCESS_TYPE
-{
-	STAT_PROC_BUFFER,
-	STAT_PROC_COLLECTOR
-}	STATS_PROCESS_TYPE;
-
-static pid_t pgstat_forkexec(STATS_PROCESS_TYPE procType);
-static void pgstat_parseArgs(int argc, char *argv[]);
+static pid_t pgstat_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgstatBufferMain(int argc, char *argv[]);
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
-static void pgstat_recvbuffer(void);
 static void pgstat_exit(SIGNAL_ARGS);
-static void pgstat_die(SIGNAL_ARGS);
+static void force_statwrite(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static int	pgstat_add_backend(PgStat_MsgHdr *msg);
-static void pgstat_sub_backend(int procpid);
-static void pgstat_drop_database(Oid databaseid);
 static void pgstat_write_statsfile(void);
-static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-					  PgStat_StatBeEntry **betab,
-					  int *numbackends);
+static HTAB *pgstat_read_statsfile(Oid onlydb);
 static void backend_read_statsfile(void);
+static void pgstat_read_current_status(void);
+
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
+static HTAB *pgstat_collect_oids(Oid catalogid);
+
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
+static void pgstat_setup_memcxt(void);
 
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
 static void pgstat_send(void *msg, int len);
 
-static void pgstat_recv_bestart(PgStat_MsgBestart *msg, int len);
-static void pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len);
-static void pgstat_recv_activity(PgStat_MsgActivity *msg, int len);
 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
@@ -192,6 +223,7 @@ static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
 
 
 /* ------------------------------------------------------------
@@ -220,14 +252,16 @@ pgstat_init(void)
 	struct timeval tv;
 	char		test_byte;
 	int			sel_res;
+	int			tries = 0;
 
 #define TESTBYTEVAL ((char) 199)
 
 	/*
-	 * Force start of collector daemon if something to collect
+	 * Force start of collector daemon if something to collect.  Note that
+	 * pgstat_collect_querystring is now an independent facility that does not
+	 * require the collector daemon.
 	 */
-	if (pgstat_collect_querystring ||
-		pgstat_collect_tuplelevel ||
+	if (pgstat_collect_tuplelevel ||
 		pgstat_collect_blocklevel)
 		pgstat_collect_startcollector = true;
 
@@ -280,6 +314,10 @@ pgstat_init(void)
 			continue;
 #endif
 
+		if (++tries > 1)
+			ereport(LOG,
+			(errmsg("trying another address for the statistics collector")));
+
 		/*
 		 * Create the socket.
 		 */
@@ -339,8 +377,12 @@ pgstat_init(void)
 		 * rules prevent it).
 		 */
 		test_byte = TESTBYTEVAL;
+
+retry1:
 		if (send(pgStatSock, &test_byte, 1, 0) != 1)
 		{
+			if (errno == EINTR)
+				goto retry1;	/* if interrupted, just retry */
 			ereport(LOG,
 					(errcode_for_socket_access(),
 					 errmsg("could not send test message on socket for statistics collector: %m")));
@@ -391,8 +433,11 @@ pgstat_init(void)
 
 		test_byte++;			/* just make sure variable is changed */
 
+retry2:
 		if (recv(pgStatSock, &test_byte, 1, 0) != 1)
 		{
+			if (errno == EINTR)
+				goto retry2;	/* if interrupted, just retry */
 			ereport(LOG,
 					(errcode_for_socket_access(),
 					 errmsg("could not receive test message on socket for statistics collector: %m")));
@@ -421,9 +466,8 @@ pgstat_init(void)
 
 	/*
 	 * Set the socket to non-blocking IO.  This ensures that if the collector
-	 * falls behind (despite the buffering process), statistics messages will
-	 * be discarded; backends won't block waiting to send messages to the
-	 * collector.
+	 * falls behind, statistics messages will be discarded; backends won't
+	 * block waiting to send messages to the collector.
 	 */
 	if (!pg_set_noblock(pgStatSock))
 	{
@@ -450,7 +494,6 @@ startup_failed:
 
 	/* Adjust GUC variables to suppress useless activity */
 	pgstat_collect_startcollector = false;
-	pgstat_collect_querystring = false;
 	pgstat_collect_tuplelevel = false;
 	pgstat_collect_blocklevel = false;
 }
@@ -473,65 +516,23 @@ pgstat_reset_all(void)
 /*
  * pgstat_forkexec() -
  *
- * Format up the arglist for, then fork and exec, statistics
- * (buffer and collector) processes
+ * Format up the arglist for, then fork and exec, statistics collector process
  */
 static pid_t
-pgstat_forkexec(STATS_PROCESS_TYPE procType)
+pgstat_forkexec(void)
 {
 	char	   *av[10];
-	int			ac = 0,
-				bufc = 0,
-				i;
-	char		pgstatBuf[2][32];
+	int			ac = 0;
 
 	av[ac++] = "postgres";
-
-	switch (procType)
-	{
-		case STAT_PROC_BUFFER:
-			av[ac++] = "-forkbuf";
-			break;
-
-		case STAT_PROC_COLLECTOR:
-			av[ac++] = "-forkcol";
-			break;
-
-		default:
-			Assert(false);
-	}
-
+	av[ac++] = "--forkcol";
 	av[ac++] = NULL;			/* filled in by postmaster_forkexec */
 
-	/* postgres_exec_path is not passed by write_backend_variables */
-	av[ac++] = postgres_exec_path;
-
-	/* Add to the arg list */
-	Assert(bufc <= lengthof(pgstatBuf));
-	for (i = 0; i < bufc; i++)
-		av[ac++] = pgstatBuf[i];
-
 	av[ac] = NULL;
 	Assert(ac < lengthof(av));
 
 	return postmaster_forkexec(ac, av);
 }
-
-
-/*
- * pgstat_parseArgs() -
- *
- * Extract data from the arglist for exec'ed statistics
- * (buffer and collector) processes
- */
-static void
-pgstat_parseArgs(int argc, char *argv[])
-{
-	Assert(argc == 4);
-
-	argc = 3;
-	StrNCpy(postgres_exec_path, argv[argc++], MAXPGPATH);
-}
 #endif   /* EXEC_BACKEND */
 
 
@@ -590,14 +591,14 @@ pgstat_start(void)
 	 * Okay, fork off the collector.
 	 */
 #ifdef EXEC_BACKEND
-	switch ((pgStatPid = pgstat_forkexec(STAT_PROC_BUFFER)))
+	switch ((pgStatPid = pgstat_forkexec()))
 #else
 	switch ((pgStatPid = fork_process()))
 #endif
 	{
 		case -1:
 			ereport(LOG,
-					(errmsg("could not fork statistics buffer: %m")));
+					(errmsg("could not fork statistics collector: %m")));
 			return 0;
 
 #ifndef EXEC_BACKEND
@@ -606,10 +607,13 @@ pgstat_start(void)
 			/* Close the postmaster's sockets */
 			ClosePostmasterPorts(false);
 
+			/* Lose the postmaster's on-exit routines */
+			on_exit_reset();
+
 			/* Drop our connection to postmaster's shared memory, as well */
 			PGSharedMemoryDetach();
 
-			PgstatBufferMain(0, NULL);
+			PgstatCollectorMain(0, NULL);
 			break;
 #endif
 
@@ -621,51 +625,9 @@ pgstat_start(void)
 	return 0;
 }
 
-
-/* ----------
- * pgstat_beterm() -
- *
- *	Called from postmaster to tell collector a backend terminated.
- * ----------
- */
-void
-pgstat_beterm(int pid)
-{
-	PgStat_MsgBeterm msg;
-
-	if (pgStatSock < 0)
-		return;
-
-	/* can't use pgstat_setheader() because it's not called in a backend */
-	MemSet(&(msg.m_hdr), 0, sizeof(msg.m_hdr));
-	msg.m_hdr.m_type = PGSTAT_MTYPE_BETERM;
-	msg.m_hdr.m_procpid = pid;
-
-	pgstat_send(&msg, sizeof(msg));
-}
-
-
-/* ----------
- * pgstat_report_autovac() -
- *
- *	Called from autovacuum.c to report startup of an autovacuum process.
- *	We are called before InitPostgres is done, so can't rely on MyDatabaseId;
- *	the db OID must be passed in, instead.
- * ----------
- */
-void
-pgstat_report_autovac(Oid dboid)
+void allow_immediate_pgstat_restart(void)
 {
-	PgStat_MsgAutovacStart msg;
-
-	if (pgStatSock < 0)
-		return;
-
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
-	msg.m_databaseid = dboid;
-	msg.m_start_time = GetCurrentTimestamp();
-
-	pgstat_send(&msg, sizeof(msg));
+		last_pgstat_start_time = 0;
 }
 
 /* ------------------------------------------------------------
@@ -675,196 +637,138 @@ pgstat_report_autovac(Oid dboid)
 
 
 /* ----------
- * pgstat_bestart() -
+ * pgstat_report_tabstat() -
  *
- *	Tell the collector that this new backend is soon ready to process
- *	queries. Called from InitPostgres.
+ *	Called from tcop/postgres.c to send the so far collected per-table
+ *	access statistics to the collector.  Note that this is called only
+ *	when not within a transaction, so it is fair to use transaction stop
+ *	time as an approximation of current time.
  * ----------
  */
 void
-pgstat_bestart(void)
+pgstat_report_tabstat(bool force)
 {
-	PgStat_MsgBestart msg;
+	/* we assume this inits to all zeroes: */
+	static const PgStat_TableCounts all_zeroes;
+	static TimestampTz last_report = 0;	
+
+	TimestampTz now;
+	PgStat_MsgTabstat regular_msg;
+	PgStat_MsgTabstat shared_msg;
+	TabStatusArray *tsa;
+	int			i;
 
-	if (pgStatSock < 0)
+	/* Don't expend a clock check if nothing to do */
+	if (pgStatTabList == NULL ||
+		pgStatTabList->tsa_used == 0)
 		return;
 
 	/*
-	 * We may not have a MyProcPort (eg, if this is the autovacuum process).
-	 * For the moment, punt and don't send BESTART --- would be better to work
-	 * out a clean way of handling "unknown clientaddr".
+	 * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+	 * msec since we last sent one, or the caller wants to force stats out.
 	 */
-	if (MyProcPort)
-	{
-		pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_BESTART);
-		msg.m_databaseid = MyDatabaseId;
-		msg.m_userid = GetSessionUserId();
-		memcpy(&msg.m_clientaddr, &MyProcPort->raddr, sizeof(msg.m_clientaddr));
-		pgstat_send(&msg, sizeof(msg));
-	}
+	now = GetCurrentTransactionStopTimestamp();
+	if (!force &&
+		!TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+		return;
+	last_report = now;
 
 	/*
-	 * Set up a process-exit hook to ensure we flush the last batch of
-	 * statistics to the collector.
+	 * Scan through the TabStatusArray struct(s) to find tables that actually
+	 * have counts, and build messages to send.  We have to separate shared
+	 * relations from regular ones because the databaseid field in the
+	 * message header has to depend on that.
 	 */
-	on_shmem_exit(pgstat_beshutdown_hook, 0);
-}
+	regular_msg.m_databaseid = MyDatabaseId;
+	shared_msg.m_databaseid = InvalidOid;
+	regular_msg.m_nentries = 0;
+	shared_msg.m_nentries = 0;
 
-/* ---------
- * pgstat_report_vacuum() -
- *
- *	Tell the collector about the table we just vacuumed.
- * ---------
- */
-void
-pgstat_report_vacuum(Oid tableoid, bool shared,
-					 bool analyze, PgStat_Counter tuples)
-{
-	PgStat_MsgVacuum msg;
-
-	if (pgStatSock < 0)
-		return;
-
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
-	msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-	msg.m_tableoid = tableoid;
-	msg.m_analyze = analyze;
-	msg.m_tuples = tuples;
-	pgstat_send(&msg, sizeof(msg));
-}
+	for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+	{
+		for (i = 0; i < tsa->tsa_used; i++)
+		{
+			PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+			PgStat_MsgTabstat *this_msg;
+			PgStat_TableEntry *this_ent;
 
-/* --------
- * pgstat_report_analyze() -
- *
- *	Tell the collector about the table we just analyzed.
- * --------
- */
-void
-pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
-					  PgStat_Counter deadtuples)
-{
-	PgStat_MsgAnalyze msg;
+			/* Shouldn't have any pending transaction-dependent counts */
+			Assert(entry->trans == NULL);
 
-	if (pgStatSock < 0)
-		return;
+			/*
+			 * Ignore entries that didn't accumulate any actual counts,
+			 * such as indexes that were opened by the planner but not used.
+			 */
+			if (memcmp(&entry->t_counts, &all_zeroes,
+					   sizeof(PgStat_TableCounts)) == 0)
+				continue;
+			/*
+			 * OK, insert data into the appropriate message, and send if full.
+			 */
+			this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+			this_ent = &this_msg->m_entry[this_msg->m_nentries];
+			this_ent->t_id = entry->t_id;
+			memcpy(&this_ent->t_counts, &entry->t_counts,
+				   sizeof(PgStat_TableCounts));
+			if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+			{
+				pgstat_send_tabstat(this_msg);
+				this_msg->m_nentries = 0;
+			}
+		}
+		/* zero out TableStatus structs after use */
+		MemSet(tsa->tsa_entries, 0,
+			   tsa->tsa_used * sizeof(PgStat_TableStatus));
+		tsa->tsa_used = 0;
+	}
 
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
-	msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-	msg.m_tableoid = tableoid;
-	msg.m_live_tuples = livetuples;
-	msg.m_dead_tuples = deadtuples;
-	pgstat_send(&msg, sizeof(msg));
+	/*
+	 * Send partial messages.  If force is true, make sure that any pending
+	 * xact commit/abort gets counted, even if no table stats to send.
+	 */
+	if (regular_msg.m_nentries > 0 ||
+		(force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+		pgstat_send_tabstat(&regular_msg);
+	if (shared_msg.m_nentries > 0)
+		pgstat_send_tabstat(&shared_msg);
 }
 
 /*
- * Flush any remaining statistics counts out to the collector at process
- * exit.   Without this, operations triggered during backend exit (such as
- * temp table deletions) won't be counted.
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
  */
 static void
-pgstat_beshutdown_hook(int code, Datum arg)
-{
-	pgstat_report_tabstat();
-}
-
-
-/* ----------
- * pgstat_report_activity() -
- *
- *	Called from tcop/postgres.c to tell the collector what the backend
- *	is actually doing (usually "<IDLE>" or the start of the query to
- *	be executed).
- * ----------
- */
-void
-pgstat_report_activity(const char *cmd_str)
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
 {
-	PgStat_MsgActivity msg;
+	int			n;
 	int			len;
 
-	if (!pgstat_collect_querystring || pgStatSock < 0)
-		return;
-
-	len = strlen(cmd_str);
-	len = pg_mbcliplen(cmd_str, len, PGSTAT_ACTIVITY_SIZE - 1);
-
-	memcpy(msg.m_cmd_str, cmd_str, len);
-	msg.m_cmd_str[len] = '\0';
-	len += offsetof(PgStat_MsgActivity, m_cmd_str) + 1;
-
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ACTIVITY);
-	pgstat_send(&msg, len);
-}
-
-
-/* ----------
- * pgstat_report_tabstat() -
- *
- *	Called from tcop/postgres.c to send the so far collected
- *	per table access statistics to the collector.
- * ----------
- */
-void
-pgstat_report_tabstat(void)
-{
-	int			i;
-
-	if (pgStatSock < 0 ||
-		!(pgstat_collect_querystring ||
-		  pgstat_collect_tuplelevel ||
-		  pgstat_collect_blocklevel))
-	{
-		/* Not reporting stats, so just flush whatever we have */
-		RegularTabStat.tsa_used = 0;
-		SharedTabStat.tsa_used = 0;
+	/* It's unlikely we'd get here with no socket, but maybe not impossible */
+	if (pgStatSock < 0)
 		return;
-	}
 
 	/*
-	 * For each message buffer used during the last query set the header
-	 * fields and send it out.
+	 * Report accumulated xact commit/rollback whenever we send a normal
+	 * tabstat message
 	 */
-	for (i = 0; i < RegularTabStat.tsa_used; i++)
+	if (OidIsValid(tsmsg->m_databaseid))
 	{
-		PgStat_MsgTabstat *tsmsg = RegularTabStat.tsa_messages[i];
-		int			n;
-		int			len;
-
-		n = tsmsg->m_nentries;
-		len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-			n * sizeof(PgStat_TableEntry);
-
 		tsmsg->m_xact_commit = pgStatXactCommit;
 		tsmsg->m_xact_rollback = pgStatXactRollback;
 		pgStatXactCommit = 0;
 		pgStatXactRollback = 0;
-
-		pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-		tsmsg->m_databaseid = MyDatabaseId;
-		pgstat_send(tsmsg, len);
 	}
-	RegularTabStat.tsa_used = 0;
-
-	/* Ditto, for shared relations */
-	for (i = 0; i < SharedTabStat.tsa_used; i++)
+	else
 	{
-		PgStat_MsgTabstat *tsmsg = SharedTabStat.tsa_messages[i];
-		int			n;
-		int			len;
-
-		n = tsmsg->m_nentries;
-		len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-			n * sizeof(PgStat_TableEntry);
-
-		/* We don't report transaction commit/abort here */
 		tsmsg->m_xact_commit = 0;
 		tsmsg->m_xact_rollback = 0;
-
-		pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-		tsmsg->m_databaseid = InvalidOid;
-		pgstat_send(tsmsg, len);
 	}
-	SharedTabStat.tsa_used = 0;
+
+	n = tsmsg->m_nentries;
+	len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+		n * sizeof(PgStat_TableEntry);
+
+	pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+	pgstat_send(tsmsg, len);
 }
 
 
@@ -874,26 +778,18 @@ pgstat_report_tabstat(void)
  *	Will tell the collector about objects he can get rid of.
  * ----------
  */
-int
+void
 pgstat_vacuum_tabstat(void)
 {
-	Relation	dbrel;
-	HeapScanDesc dbscan;
-	HeapTuple	dbtup;
-	Oid		   *dbidlist;
-	int			dbidalloc;
-	int			dbidused;
+	HTAB	   *htab;
+	PgStat_MsgTabpurge msg;
 	HASH_SEQ_STATUS hstat;
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatTabEntry *tabentry;
-	HeapTuple	reltup;
-	int			nobjects = 0;
-	PgStat_MsgTabpurge msg;
 	int			len;
-	int			i;
 
 	if (pgStatSock < 0)
-		return 0;
+		return;
 
 	/*
 	 * If not done for this transaction, read the statistics collector stats
@@ -902,15 +798,41 @@ pgstat_vacuum_tabstat(void)
 	backend_read_statsfile();
 
 	/*
-	 * Lookup our own database entry; if not found, nothing to do.
+	 * Read pg_database and make a list of OIDs of all existing databases
+	 */
+	htab = pgstat_collect_oids(DatabaseRelationId);
+
+	/*
+	 * Search the database hash table for dead databases and tell the
+	 * collector to drop them.
+	 */
+	hash_seq_init(&hstat, pgStatDBHash);
+	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+	{
+		Oid			dbid = dbentry->databaseid;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
+			pgstat_drop_database(dbid);
+	}
+
+	/* Clean up */
+	hash_destroy(htab);
+
+	/*
+	 * Lookup our own database entry; if not found, nothing more to do.
 	 */
 	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
 												 (void *) &MyDatabaseId,
 												 HASH_FIND, NULL);
-	if (dbentry == NULL)
-		return -1;
-	if (dbentry->tables == NULL)
-		return 0;
+	if (dbentry == NULL || dbentry->tables == NULL)
+		return;
+
+	/*
+	 * Similarly to above, make a list of all known relations in this DB.
+	 */
+	htab = pgstat_collect_oids(RelationRelationId);
 
 	/*
 	 * Initialize our messages table counter to zero
@@ -923,27 +845,20 @@ pgstat_vacuum_tabstat(void)
 	hash_seq_init(&hstat, dbentry->tables);
 	while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
 	{
-		/*
-		 * Check if this relation is still alive by looking up it's pg_class
-		 * tuple in the system catalog cache.
-		 */
-		reltup = SearchSysCache(RELOID,
-								ObjectIdGetDatum(tabentry->tableid),
-								0, 0, 0);
-		if (HeapTupleIsValid(reltup))
-		{
-			ReleaseSysCache(reltup);
+		Oid			tabid = tabentry->tableid;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
 			continue;
-		}
 
 		/*
-		 * Add this table's Oid to the message
+		 * Not there, so add this table's Oid to the message
 		 */
-		msg.m_tableid[msg.m_nentries++] = tabentry->tableid;
-		nobjects++;
+		msg.m_tableid[msg.m_nentries++] = tabid;
 
 		/*
-		 * If the message is full, send it out and reinitialize to zero
+		 * If the message is full, send it out and reinitialize to empty
 		 */
 		if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
 		{
@@ -971,62 +886,51 @@ pgstat_vacuum_tabstat(void)
 		pgstat_send(&msg, len);
 	}
 
-	/*
-	 * Read pg_database and remember the Oid's of all existing databases
-	 */
-	dbidalloc = 256;
-	dbidused = 0;
-	dbidlist = (Oid *) palloc(sizeof(Oid) * dbidalloc);
+	/* Clean up */
+	hash_destroy(htab);
+}
 
-	dbrel = heap_open(DatabaseRelationId, AccessShareLock);
-	dbscan = heap_beginscan(dbrel, SnapshotNow, 0, NULL);
-	while ((dbtup = heap_getnext(dbscan, ForwardScanDirection)) != NULL)
-	{
-		if (dbidused >= dbidalloc)
-		{
-			dbidalloc *= 2;
-			dbidlist = (Oid *) repalloc((char *) dbidlist,
-										sizeof(Oid) * dbidalloc);
-		}
-		dbidlist[dbidused++] = HeapTupleGetOid(dbtup);
-	}
-	heap_endscan(dbscan);
-	heap_close(dbrel, AccessShareLock);
 
-	/*
-	 * Search the database hash table for dead databases and tell the
-	 * collector to drop them as well.
-	 */
-	hash_seq_init(&hstat, pgStatDBHash);
-	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+/* ----------
+ * pgstat_collect_oids() -
+ *
+ *	Collect the OIDs of either all databases or all tables, according to
+ *	the parameter, into a temporary hash table.  Caller should hash_destroy
+ *	the result when done with it.
+ * ----------
+ */
+static HTAB *
+pgstat_collect_oids(Oid catalogid)
+{
+	HTAB	   *htab;
+	HASHCTL		hash_ctl;
+	Relation	rel;
+	HeapScanDesc scan;
+	HeapTuple	tup;
+
+	memset(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(Oid);
+	hash_ctl.entrysize = sizeof(Oid);
+	hash_ctl.hash = oid_hash;
+	htab = hash_create("Temporary table of OIDs",
+					   PGSTAT_TAB_HASH_SIZE,
+					   &hash_ctl,
+					   HASH_ELEM | HASH_FUNCTION);
+
+	rel = heap_open(catalogid, AccessShareLock);
+	scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+	while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
-		Oid			dbid = dbentry->databaseid;
+		Oid		thisoid = HeapTupleGetOid(tup);
 
-		for (i = 0; i < dbidused; i++)
-		{
-			if (dbidlist[i] == dbid)
-			{
-				dbid = InvalidOid;
-				break;
-			}
-		}
+		CHECK_FOR_INTERRUPTS();
 
-		if (dbid != InvalidOid)
-		{
-			pgstat_drop_database(dbid);
-			nobjects++;
-		}
+		(void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
 	}
+	heap_endscan(scan);
+	heap_close(rel, AccessShareLock);
 
-	/*
-	 * Free the dbid list.
-	 */
-	pfree(dbidlist);
-
-	/*
-	 * Tell the caller how many removeable objects we found
-	 */
-	return nobjects;
+	return htab;
 }
 
 
@@ -1034,12 +938,11 @@ pgstat_vacuum_tabstat(void)
  * pgstat_drop_database() -
  *
  *	Tell the collector that we just dropped a database.
- *	This is the only message that shouldn't get lost in space. Otherwise
- *	the collector will keep the statistics for the dead DB until his
- *	stats file got removed while the postmaster is down.
+ *	(If the message gets lost, we will still clean the dead DB eventually
+ *	via future invocations of pgstat_vacuum_tabstat().)
  * ----------
  */
-static void
+void
 pgstat_drop_database(Oid databaseid)
 {
 	PgStat_MsgDropdb msg;
@@ -1053,6 +956,34 @@ pgstat_drop_database(Oid databaseid)
 }
 
 
+/* ----------
+ * pgstat_drop_relation() -
+ *
+ *	Tell the collector that relation "relid" was just dropped, by sending a
+ *	one-entry TABPURGE message stamped with MyDatabaseId.  (If the message is
+ *	lost, future pgstat_vacuum_tabstat() calls still clean the dead entry.)
+ * ----------
+ */
+void
+pgstat_drop_relation(Oid relid)
+{
+	PgStat_MsgTabpurge msg;
+	int			len;
+
+	if (pgStatSock < 0)			/* no stats socket, so nothing can be sent */
+		return;
+
+	msg.m_tableid[0] = relid;
+	msg.m_nentries = 1;			/* exactly one table OID in this purge */
+
+	len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) +sizeof(Oid);	/* fixed part plus one OID */
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
+	msg.m_databaseid = MyDatabaseId;	/* NOTE(review): shared rels are reported under InvalidOid elsewhere in this file -- confirm callers never pass one */
+	pgstat_send(&msg, len);
+}
+
+
 /* ----------
  * pgstat_reset_counters() -
  *
@@ -1079,262 +1010,640 @@ pgstat_reset_counters(void)
 
 
 /* ----------
- * pgstat_ping() -
+ * pgstat_report_autovac() -
  *
- *	Send some junk data to the collector to increase traffic.
+ *	Called from autovacuum.c to report startup of an autovacuum process.
+ *	We are called before InitPostgres is done, so can't rely on MyDatabaseId;
+ *	the db OID must be passed in, instead.
  * ----------
  */
 void
-pgstat_ping(void)
+pgstat_report_autovac(Oid dboid)
 {
-	PgStat_MsgDummy msg;
+	PgStat_MsgAutovacStart msg;
 
 	if (pgStatSock < 0)
 		return;
 
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
+	msg.m_databaseid = dboid;
+	msg.m_start_time = GetCurrentTimestamp();
+
 	pgstat_send(&msg, sizeof(msg));
 }
 
-/*
- * Enlarge a TabStatArray
+
+/* ---------
+ * pgstat_report_vacuum() -
+ *
+ *	Tell the collector about the table we just vacuumed.
+ * ---------
  */
-static void
-more_tabstat_space(TabStatArray *tsarr)
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+					 bool analyze, PgStat_Counter tuples)
 {
-	PgStat_MsgTabstat *newMessages;
-	PgStat_MsgTabstat **msgArray;
-	int			newAlloc;
-	int			i;
+	PgStat_MsgVacuum msg;
+
+	if (pgStatSock < 0 ||
+		!pgstat_collect_tuplelevel)
+		return;
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
+	msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+	msg.m_tableoid = tableoid;
+	msg.m_analyze = analyze;
+	msg.m_autovacuum = IsAutoVacuumWorkerProcess();	/* is this autovacuum? */
+	msg.m_vacuumtime = GetCurrentTimestamp();
+	msg.m_tuples = tuples;
+	pgstat_send(&msg, sizeof(msg));
+}
+
+/* --------
+ * pgstat_report_analyze() -
+ *
+ *	Tell the collector about the table we just analyzed.
+ * --------
+ */
+void
+pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
+					  PgStat_Counter deadtuples)
+{
+	PgStat_MsgAnalyze msg;
 
-	AssertArg(PointerIsValid(tsarr));
+	if (pgStatSock < 0 ||
+		!pgstat_collect_tuplelevel)
+		return;
 
-	newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
+	msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+	msg.m_tableoid = tableoid;
+	msg.m_autovacuum = IsAutoVacuumWorkerProcess();	/* is this autovacuum? */
+	msg.m_analyzetime = GetCurrentTimestamp();
+	msg.m_live_tuples = livetuples;
+	msg.m_dead_tuples = deadtuples;
+	pgstat_send(&msg, sizeof(msg));
+}
 
-	/* Create (another) quantum of message buffers */
-	newMessages = (PgStat_MsgTabstat *)
-		MemoryContextAllocZero(TopMemoryContext,
-							   sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
 
-	/* Create or enlarge the pointer array */
-	if (tsarr->tsa_messages == NULL)
-		msgArray = (PgStat_MsgTabstat **)
-			MemoryContextAlloc(TopMemoryContext,
-							   sizeof(PgStat_MsgTabstat *) * newAlloc);
-	else
-		msgArray = (PgStat_MsgTabstat **)
-			repalloc(tsarr->tsa_messages,
-					 sizeof(PgStat_MsgTabstat *) * newAlloc);
+/* ----------
+ * pgstat_ping() -
+ *
+ *	Send some junk data to the collector to increase traffic.
+ * ----------
+ */
+void
+pgstat_ping(void)
+{
+	PgStat_MsgDummy msg;
 
-	for (i = 0; i < TABSTAT_QUANTUM; i++)
-		msgArray[tsarr->tsa_alloc + i] = newMessages++;
-	tsarr->tsa_messages = msgArray;
-	tsarr->tsa_alloc = newAlloc;
+	if (pgStatSock < 0)
+		return;
 
-	Assert(tsarr->tsa_used < tsarr->tsa_alloc);
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
+	pgstat_send(&msg, sizeof(msg));
 }
 
+
 /* ----------
  * pgstat_initstats() -
  *
- *	Called from various places usually dealing with initialization
- *	of Relation or Scan structures. The data placed into these
- *	structures from here tell where later to count for buffer reads,
- *	scans and tuples fetched.
+ *	Initialize a relcache entry to count access statistics.
+ *	Called whenever a relation is opened.
+ *
+ *	We assume that a relcache entry's pgstat_info field is zeroed by
+ *	relcache.c when the relcache entry is made; thereafter it is long-lived
+ *	data.  We can avoid repeated searches of the TabStatus arrays when the
+ *	same relation is touched repeatedly within a transaction.
  * ----------
  */
 void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
 {
 	Oid			rel_id = rel->rd_id;
-	PgStat_TableEntry *useent;
-	TabStatArray *tsarr;
-	PgStat_MsgTabstat *tsmsg;
-	int			mb;
-	int			i;
+	char		relkind = rel->rd_rel->relkind;
 
-	/*
-	 * Initialize data not to count at all.
-	 */
-	stats->tabentry = NULL;
+	/* We only count stats for things that have storage */
+	if (!(relkind == RELKIND_RELATION ||
+		  relkind == RELKIND_INDEX ||
+		  relkind == RELKIND_TOASTVALUE))
+	{
+		rel->pgstat_info = NULL;
+		return;
+	}
 
 	if (pgStatSock < 0 ||
 		!(pgstat_collect_tuplelevel ||
 		  pgstat_collect_blocklevel))
+	{
+		/* We're not counting at all */
+		rel->pgstat_info = NULL;
+		return;
+	}
+
+	/*
+	 * If we already set up this relation in the current transaction,
+	 * nothing to do.
+	 */
+	if (rel->pgstat_info != NULL &&
+		rel->pgstat_info->t_id == rel_id)
 		return;
 
-	tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
+	/* Else find or make the PgStat_TableStatus entry, and update link */
+	rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+	PgStat_TableStatus *entry;
+	TabStatusArray *tsa;
+	TabStatusArray *prev_tsa;
+	int			i;
 
 	/*
-	 * Search the already-used message slots for this relation.
+	 * Search the already-used tabstat slots for this relation.
 	 */
-	for (mb = 0; mb < tsarr->tsa_used; mb++)
+	prev_tsa = NULL;
+	for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
 	{
-		tsmsg = tsarr->tsa_messages[mb];
-
-		for (i = tsmsg->m_nentries; --i >= 0;)
+		for (i = 0; i < tsa->tsa_used; i++)
 		{
-			if (tsmsg->m_entry[i].t_id == rel_id)
-			{
-				stats->tabentry = (void *) &(tsmsg->m_entry[i]);
-				return;
-			}
+			entry = &tsa->tsa_entries[i];
+			if (entry->t_id == rel_id)
+				return entry;
 		}
 
-		if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-			continue;
-
-		/*
-		 * Not found, but found a message buffer with an empty slot instead.
-		 * Fine, let's use this one.
-		 */
-		i = tsmsg->m_nentries++;
-		useent = &tsmsg->m_entry[i];
-		MemSet(useent, 0, sizeof(PgStat_TableEntry));
-		useent->t_id = rel_id;
-		stats->tabentry = (void *) useent;
-		return;
+		if (tsa->tsa_used < TABSTAT_QUANTUM)
+		{
+			/*
+			 * It must not be present, but we found a free slot instead.
+			 * Fine, let's use this one.  We assume the entry was already
+			 * zeroed, either at creation or after last use.
+			 */
+			entry = &tsa->tsa_entries[tsa->tsa_used++];
+			entry->t_id = rel_id;
+			entry->t_shared = isshared;
+			return entry;
+		}
 	}
 
 	/*
-	 * If we ran out of message buffers, we just allocate more.
+	 * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
 	 */
-	if (tsarr->tsa_used >= tsarr->tsa_alloc)
-		more_tabstat_space(tsarr);
+	tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+													sizeof(TabStatusArray));
+	if (prev_tsa)
+		prev_tsa->tsa_next = tsa;
+	else
+		pgStatTabList = tsa;
 
 	/*
-	 * Use the first entry of the next message buffer.
+	 * Use the first entry of the new TabStatusArray.
 	 */
-	mb = tsarr->tsa_used++;
-	tsmsg = tsarr->tsa_messages[mb];
-	tsmsg->m_nentries = 1;
-	useent = &tsmsg->m_entry[0];
-	MemSet(useent, 0, sizeof(PgStat_TableEntry));
-	useent->t_id = rel_id;
-	stats->tabentry = (void *) useent;
+	entry = &tsa->tsa_entries[tsa->tsa_used++];
+	entry->t_id = rel_id;
+	entry->t_shared = isshared;
+	return entry;
 }
 
-
-/* ----------
- * pgstat_count_xact_commit() -
- *
- *	Called from access/transam/xact.c to count transaction commits.
- * ----------
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
  */
-void
-pgstat_count_xact_commit(void)
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
 {
-	if (!(pgstat_collect_querystring ||
-		  pgstat_collect_tuplelevel ||
-		  pgstat_collect_blocklevel))
-		return;
+	PgStat_SubXactStatus *xact_state;
+
+	xact_state = pgStatXactStack;
+	if (xact_state == NULL || xact_state->nest_level != nest_level)
+	{
+		xact_state = (PgStat_SubXactStatus *)
+			MemoryContextAlloc(TopTransactionContext,
+							   sizeof(PgStat_SubXactStatus));
+		xact_state->nest_level = nest_level;
+		xact_state->prev = pgStatXactStack;
+		xact_state->first = NULL;
+		pgStatXactStack = xact_state;
+	}
+	return xact_state;
+}
 
-	pgStatXactCommit++;
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+	PgStat_SubXactStatus *xact_state;
+	PgStat_TableXactStatus *trans;
 
 	/*
-	 * If there was no relation activity yet, just make one existing message
-	 * buffer used without slots, causing the next report to tell new
-	 * xact-counters.
+	 * If this is the first rel to be modified at the current nest level,
+	 * we first have to push a transaction stack entry.
 	 */
-	if (RegularTabStat.tsa_alloc == 0)
-		more_tabstat_space(&RegularTabStat);
+	xact_state = get_tabstat_stack_level(nest_level);
+
+	/* Now make a per-table stack entry */
+	trans = (PgStat_TableXactStatus *)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(PgStat_TableXactStatus));
+	trans->nest_level = nest_level;
+	trans->upper = pgstat_info->trans;
+	trans->parent = pgstat_info;
+	trans->next = xact_state->first;
+	xact_state->first = trans;
+	pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+	PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-	if (RegularTabStat.tsa_used == 0)
+	if (pgstat_collect_tuplelevel && pgstat_info != NULL)
 	{
-		RegularTabStat.tsa_used++;
-		RegularTabStat.tsa_messages[0]->m_nentries = 0;
+		int		nest_level = GetCurrentTransactionNestLevel();
+
+		/* t_tuples_inserted is nontransactional, so just advance it */
+		pgstat_info->t_counts.t_tuples_inserted++;
+
+		/* We have to log the transactional effect at the proper level */
+		if (pgstat_info->trans == NULL ||
+			pgstat_info->trans->nest_level != nest_level)
+			add_tabstat_xact_level(pgstat_info, nest_level);
+
+		pgstat_info->trans->tuples_inserted++;
 	}
 }
 
-
-/* ----------
- * pgstat_count_xact_rollback() -
- *
- *	Called from access/transam/xact.c to count transaction rollbacks.
- * ----------
+/*
+ * pgstat_count_heap_update - count a tuple update
  */
 void
-pgstat_count_xact_rollback(void)
+pgstat_count_heap_update(Relation rel)
 {
-	if (!(pgstat_collect_querystring ||
-		  pgstat_collect_tuplelevel ||
-		  pgstat_collect_blocklevel))
-		return;
+	PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-	pgStatXactRollback++;
+	if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+	{
+		int		nest_level = GetCurrentTransactionNestLevel();
 
-	/*
-	 * If there was no relation activity yet, just make one existing message
-	 * buffer used without slots, causing the next report to tell new
-	 * xact-counters.
-	 */
-	if (RegularTabStat.tsa_alloc == 0)
-		more_tabstat_space(&RegularTabStat);
+		/* t_tuples_updated is nontransactional, so just advance it */
+		pgstat_info->t_counts.t_tuples_updated++;
 
-	if (RegularTabStat.tsa_used == 0)
+		/* We have to log the transactional effect at the proper level */
+		if (pgstat_info->trans == NULL ||
+			pgstat_info->trans->nest_level != nest_level)
+			add_tabstat_xact_level(pgstat_info, nest_level);
+
+		/* An UPDATE both inserts a new tuple and deletes the old */
+		pgstat_info->trans->tuples_inserted++;
+		pgstat_info->trans->tuples_deleted++;
+	}
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+	PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+	if (pgstat_collect_tuplelevel && pgstat_info != NULL)
 	{
-		RegularTabStat.tsa_used++;
-		RegularTabStat.tsa_messages[0]->m_nentries = 0;
+		int		nest_level = GetCurrentTransactionNestLevel();
+
+		/* t_tuples_deleted is nontransactional, so just advance it */
+		pgstat_info->t_counts.t_tuples_deleted++;
+
+		/* We have to log the transactional effect at the proper level */
+		if (pgstat_info->trans == NULL ||
+			pgstat_info->trans->nest_level != nest_level)
+			add_tabstat_xact_level(pgstat_info, nest_level);
+
+		pgstat_info->trans->tuples_deleted++;
 	}
 }
 
 
 /* ----------
- * pgstat_fetch_stat_dbentry() -
+ * AtEOXact_PgStat
  *
- *	Support function for the SQL-callable pgstat* functions. Returns
- *	the collected statistics for one database or NULL. NULL doesn't mean
- *	that the database doesn't exist, it is just not yet known by the
- *	collector, so the caller is better off to report ZERO instead.
+ *	Called from access/transam/xact.c at top-level transaction commit/abort.
  * ----------
  */
-PgStat_StatDBEntry *
-pgstat_fetch_stat_dbentry(Oid dbid)
+void
+AtEOXact_PgStat(bool isCommit)
 {
+	PgStat_SubXactStatus *xact_state;
+
 	/*
-	 * If not done for this transaction, read the statistics collector stats
-	 * file into some hash tables.
+	 * Count transaction commit or abort.  (We use counters, not just bools,
+	 * in case the reporting message isn't sent right away.)
 	 */
-	backend_read_statsfile();
+	if (isCommit)
+		pgStatXactCommit++;
+	else
+		pgStatXactRollback++;
 
 	/*
-	 * Lookup the requested database; return NULL if not found
+	 * Transfer transactional insert/delete counts into the base tabstat
+	 * entries.  We don't bother to free any of the transactional state,
+	 * since it's all in TopTransactionContext and will go away anyway.
 	 */
-	return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-											  (void *) &dbid,
-											  HASH_FIND, NULL);
-}
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		PgStat_TableXactStatus *trans;
+
+		Assert(xact_state->nest_level == 1);
+		Assert(xact_state->prev == NULL);
+		for (trans = xact_state->first; trans != NULL; trans = trans->next)
+		{
+			PgStat_TableStatus *tabstat;
+
+			Assert(trans->nest_level == 1);
+			Assert(trans->upper == NULL);
+			tabstat = trans->parent;
+			Assert(tabstat->trans == trans);
+			if (isCommit)
+			{
+				tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+				tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+			}
+			else
+			{
+				/* inserted tuples are dead, deleted tuples are unaffected */
+				tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+			}
+			tabstat->trans = NULL;
+		}
+	}
+	pgStatXactStack = NULL;
 
+	/* Make sure any stats snapshot is thrown away */
+	pgstat_clear_snapshot();
+}
 
 /* ----------
- * pgstat_fetch_stat_tabentry() -
+ * AtEOSubXact_PgStat
  *
- *	Support function for the SQL-callable pgstat* functions. Returns
- *	the collected statistics for one table or NULL. NULL doesn't mean
- *	that the table doesn't exist, it is just not yet known by the
- *	collector, so the caller is better off to report ZERO instead.
+ *	Called from access/transam/xact.c at subtransaction commit/abort.
  * ----------
  */
-PgStat_StatTabEntry *
-pgstat_fetch_stat_tabentry(Oid relid)
+void
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
 {
-	Oid			dbid;
-	PgStat_StatDBEntry *dbentry;
-	PgStat_StatTabEntry *tabentry;
+	PgStat_SubXactStatus *xact_state;
 
 	/*
-	 * If not done for this transaction, read the statistics collector stats
-	 * file into some hash tables.
+	 * Transfer transactional insert/delete counts into the next higher
+	 * subtransaction state.
 	 */
-	backend_read_statsfile();
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL &&
+		xact_state->nest_level >= nestDepth)
+	{
+		PgStat_TableXactStatus *trans;
+		PgStat_TableXactStatus *next_trans;
 
-	/*
-	 * Lookup our database, then look in its table hash table.
-	 */
-	dbid = MyDatabaseId;
-	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-												 (void *) &dbid,
-												 HASH_FIND, NULL);
+		/* delink xact_state from stack immediately to simplify reuse case */
+		pgStatXactStack = xact_state->prev;
+
+		for (trans = xact_state->first; trans != NULL; trans = next_trans)
+		{
+			PgStat_TableStatus *tabstat;
+
+			next_trans = trans->next;
+			Assert(trans->nest_level == nestDepth);
+			tabstat = trans->parent;
+			Assert(tabstat->trans == trans);
+			if (isCommit)
+			{
+				if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+				{
+					trans->upper->tuples_inserted += trans->tuples_inserted;
+					trans->upper->tuples_deleted += trans->tuples_deleted;
+					tabstat->trans = trans->upper;
+					pfree(trans);
+				}
+				else
+				{
+					/*
+					 * When there isn't an immediate parent state, we can
+					 * just reuse the record instead of going through a
+					 * palloc/pfree pushup (this works since it's all in
+					 * TopTransactionContext anyway).  We have to re-link
+					 * it into the parent level, though, and that might mean
+					 * pushing a new entry into the pgStatXactStack.
+					 */
+					PgStat_SubXactStatus *upper_xact_state;
+
+					upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+					trans->next = upper_xact_state->first;
+					upper_xact_state->first = trans;
+					trans->nest_level = nestDepth - 1;
+				}
+			}
+			else
+			{
+				/*
+				 * On abort, inserted tuples are dead (and can be bounced out
+				 * to the top-level tabstat), deleted tuples are unaffected
+				 */
+				tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+				tabstat->trans = trans->upper;
+				pfree(trans);
+			}
+		}
+		pfree(xact_state);
+	}
+}
+
+
+/*
+ * AtPrepare_PgStat
+ *		Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		PgStat_TableXactStatus *trans;
+
+		Assert(xact_state->nest_level == 1);
+		Assert(xact_state->prev == NULL);
+		for (trans = xact_state->first; trans != NULL; trans = trans->next)
+		{
+			PgStat_TableStatus *tabstat;
+			TwoPhasePgStatRecord record;
+
+			Assert(trans->nest_level == 1);
+			Assert(trans->upper == NULL);
+			tabstat = trans->parent;
+			Assert(tabstat->trans == trans);
+
+			record.tuples_inserted = trans->tuples_inserted;
+			record.tuples_deleted = trans->tuples_deleted;
+			record.t_id = tabstat->t_id;
+			record.t_shared = tabstat->t_shared;
+
+			RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+								   &record, sizeof(TwoPhasePgStatRecord));
+		}
+	}
+}
+
+/*
+ * PostPrepare_PgStat
+ *		Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+	PgStat_SubXactStatus *xact_state;
+
+	/*
+	 * We don't bother to free any of the transactional state,
+	 * since it's all in TopTransactionContext and will go away anyway.
+	 */
+	xact_state = pgStatXactStack;
+	if (xact_state != NULL)
+	{
+		PgStat_TableXactStatus *trans;
+
+		for (trans = xact_state->first; trans != NULL; trans = trans->next)
+		{
+			PgStat_TableStatus *tabstat;
+
+			tabstat = trans->parent;
+			tabstat->trans = NULL;
+		}
+	}
+	pgStatXactStack = NULL;
+
+	/* Make sure any stats snapshot is thrown away */
+	pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+						   void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableStatus *pgstat_info;
+
+	/* Find or create a tabstat entry for the rel */
+	pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+	pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+	pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+						  void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableStatus *pgstat_info;
+
+	/* Find or create a tabstat entry for the rel */
+	pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+	/* inserted tuples are dead, deleted tuples are no-ops */
+	pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_dbentry() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	the collected statistics for one database or NULL. NULL doesn't mean
+ *	that the database doesn't exist, it is just not yet known by the
+ *	collector, so the caller is better off to report ZERO instead.
+ * ----------
+ */
+PgStat_StatDBEntry *
+pgstat_fetch_stat_dbentry(Oid dbid)
+{
+	/*
+	 * If not done for this transaction, read the statistics collector stats
+	 * file into some hash tables.
+	 */
+	backend_read_statsfile();
+
+	/*
+	 * Lookup the requested database; return NULL if not found
+	 */
+	return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+											  (void *) &dbid,
+											  HASH_FIND, NULL);
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_tabentry() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	the collected statistics for one table or NULL. NULL doesn't mean
+ *	that the table doesn't exist, it is just not yet known by the
+ *	collector, so the caller is better off to report ZERO instead.
+ * ----------
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry(Oid relid)
+{
+	Oid			dbid;
+	PgStat_StatDBEntry *dbentry;
+	PgStat_StatTabEntry *tabentry;
+
+	/*
+	 * If not done for this transaction, read the statistics collector stats
+	 * file into some hash tables.
+	 */
+	backend_read_statsfile();
+
+	/*
+	 * Lookup our database, then look in its table hash table.
+	 */
+	dbid = MyDatabaseId;
+	dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+												 (void *) &dbid,
+												 HASH_FIND, NULL);
 	if (dbentry != NULL && dbentry->tables != NULL)
 	{
 		tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
@@ -1368,40 +1677,365 @@ pgstat_fetch_stat_tabentry(Oid relid)
  * pgstat_fetch_stat_beentry() -
  *
  *	Support function for the SQL-callable pgstat* functions. Returns
- *	the actual activity slot of one active backend. The caller is
- *	responsible for a check if the actual user is permitted to see
- *	that info (especially the querystring).
+ *	our local copy of the current-activity entry for one backend.
+ *
+ *	NB: caller is responsible for a check if the user is permitted to see
+ *	this info (especially the querystring).
  * ----------
  */
-PgStat_StatBeEntry *
+PgBackendStatus *
 pgstat_fetch_stat_beentry(int beid)
+{
+	pgstat_read_current_status();
+
+	if (beid < 1 || beid > localNumBackends)
+		return NULL;
+
+	return &localBackendStatusTable[beid - 1];
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_numbackends() -
+ *
+ *	Support function for the SQL-callable pgstat* functions. Returns
+ *	the number of valid local entries, ie, the largest accepted beid.
+ * ----------
+ */
+int
+pgstat_fetch_stat_numbackends(void)
+{
+	pgstat_read_current_status();
+
+	return localNumBackends;
+}
+
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ *  Support function for the SQL-callable pgstat* functions. Returns
+ *  a pointer to the global statistics struct.
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
 {
 	backend_read_statsfile();
 
-	if (beid < 1 || beid > pgStatNumBackends)
-		return NULL;
+	return &globalStats;
+}
+
+
+/* ------------------------------------------------------------
+ * Functions for management of the shared-memory PgBackendStatus array
+ * ------------------------------------------------------------
+ */
+
+static PgBackendStatus *BackendStatusArray = NULL;
+static PgBackendStatus *MyBEEntry = NULL;
+
+
+/*
+ * Report shared-memory space needed by CreateSharedBackendStatus.
+ */
+Size
+BackendStatusShmemSize(void)
+{
+	Size		size;
+
+	size = mul_size(sizeof(PgBackendStatus), MaxBackends);
+	return size;
+}
+
+/*
+ * Initialize the shared status array during postmaster startup.
+ */
+void
+CreateSharedBackendStatus(void)
+{
+	Size		size = BackendStatusShmemSize();
+	bool		found;
+
+	/* Create or attach to the shared array */
+	BackendStatusArray = (PgBackendStatus *)
+		ShmemInitStruct("Backend Status Array", size, &found);
+
+	if (!found)
+	{
+		/*
+		 * We're the first - initialize.
+		 */
+		MemSet(BackendStatusArray, 0, size);
+	}
+}
+
+
+/* ----------
+ * pgstat_bestart() -
+ *
+ *	Initialize this backend's entry in the PgBackendStatus array,
+ *	and set up an on-proc-exit hook that will clear it again.
+ *	Called from InitPostgres.  MyBackendId and MyDatabaseId must be set.
+ * ----------
+ */
+void
+pgstat_bestart(void)
+{
+	volatile PgBackendStatus *beentry;
+	TimestampTz proc_start_timestamp;
+	Oid			userid;
+	SockAddr	clientaddr;
+
+	Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+	MyBEEntry = &BackendStatusArray[MyBackendId - 1];
+
+	/*
+	 * To minimize the time spent modifying the entry, fetch all the needed
+	 * data first.
+	 *
+	 * If we have a MyProcPort, use its session start time (for consistency,
+	 * and to save a kernel call).
+	 */
+	if (MyProcPort)
+		proc_start_timestamp = MyProcPort->SessionStartTime;
+	else
+		proc_start_timestamp = GetCurrentTimestamp();
+	userid = GetSessionUserId();
+
+	/*
+	 * We may not have a MyProcPort (eg, if this is the autovacuum process).
+	 * If so, use all-zeroes client address, which is dealt with specially in
+	 * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
+	 */
+	if (MyProcPort)
+		memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr));
+	else
+		MemSet(&clientaddr, 0, sizeof(clientaddr));
+
+	/*
+	 * Initialize my status entry, following the protocol of bumping
+	 * st_changecount before and after; and make sure it's even afterwards. We
+	 * use a volatile pointer here to ensure the compiler doesn't try to get
+	 * cute.
+	 */
+	beentry = MyBEEntry;
+	do
+	{
+		beentry->st_changecount++;
+	} while ((beentry->st_changecount & 1) == 0);
+
+	beentry->st_procpid = MyProcPid;
+	beentry->st_proc_start_timestamp = proc_start_timestamp;
+	beentry->st_activity_start_timestamp = 0;
+	beentry->st_txn_start_timestamp = 0;
+	beentry->st_databaseid = MyDatabaseId;
+	beentry->st_userid = userid;
+	beentry->st_clientaddr = clientaddr;
+	beentry->st_waiting = false;
+	beentry->st_activity[0] = '\0';
+	/* Also make sure the last byte in the string area is always 0 */
+	beentry->st_activity[PGBE_ACTIVITY_SIZE - 1] = '\0';
+
+	beentry->st_changecount++;
+	Assert((beentry->st_changecount & 1) == 0);
+
+	/*
+	 * Set up a process-exit hook to clean up.
+	 */
+	on_shmem_exit(pgstat_beshutdown_hook, 0);
+}
+
+/*
+ * Shut down a single backend's statistics reporting at process exit.
+ *
+ * Flush any remaining statistics counts out to the collector.
+ * Without this, operations triggered during backend exit (such as
+ * temp table deletions) won't be counted.
+ *
+ * Lastly, clear out our entry in the PgBackendStatus array.
+ */
+static void
+pgstat_beshutdown_hook(int code, Datum arg)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+
+	pgstat_report_tabstat(true);
+
+	/*
+	 * Clear my status entry, following the protocol of bumping st_changecount
+	 * before and after.  We use a volatile pointer here to ensure the
+	 * compiler doesn't try to get cute.
+	 */
+	beentry->st_changecount++;
+
+	beentry->st_procpid = 0;	/* mark invalid */
+
+	beentry->st_changecount++;
+	Assert((beentry->st_changecount & 1) == 0);
+}
+
+
+/* ----------
+ * pgstat_report_activity() -
+ *
+ *	Called from tcop/postgres.c to report what the backend is actually doing
+ *	(usually "<IDLE>" or the start of the query to be executed).
+ * ----------
+ */
+void
+pgstat_report_activity(const char *cmd_str)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+	TimestampTz start_timestamp;
+	int			len;
+
+	if (!pgstat_collect_querystring || !beentry)
+		return;
+
+	/*
+	 * To minimize the time spent modifying the entry, fetch all the needed
+	 * data first.
+	 */
+	start_timestamp = GetCurrentStatementStartTimestamp();
+
+	len = strlen(cmd_str);
+	len = pg_mbcliplen(cmd_str, len, PGBE_ACTIVITY_SIZE - 1);
+
+	/*
+	 * Update my status entry, following the protocol of bumping
+	 * st_changecount before and after.  We use a volatile pointer here to
+	 * ensure the compiler doesn't try to get cute.
+	 */
+	beentry->st_changecount++;
+
+	beentry->st_activity_start_timestamp = start_timestamp;
+	memcpy((char *) beentry->st_activity, cmd_str, len);
+	beentry->st_activity[len] = '\0';
+
+	beentry->st_changecount++;
+	Assert((beentry->st_changecount & 1) == 0);
+}
+
+/*
+ * Set the transaction start timestamp in this backend's status entry to
+ * the specified value.  A value of 0 signifies that there is no currently
+ * active transaction.
+ */
+void
+pgstat_report_txn_timestamp(TimestampTz tstamp)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+
+	if (!pgstat_collect_querystring || !beentry)
+		return;
+
+	/*
+	 * Update my status entry, following the protocol of bumping
+	 * st_changecount before and after.  We use a volatile pointer
+	 * here to ensure the compiler doesn't try to get cute.
+	 */
+	beentry->st_changecount++;
+	beentry->st_txn_start_timestamp = tstamp;
+	beentry->st_changecount++;
+	Assert((beentry->st_changecount & 1) == 0);
+}
+
+/* ----------
+ * pgstat_report_waiting() -
+ *
+ *	Called from lock manager to report beginning or end of a lock wait.
+ *
+ * NB: this *must* be able to survive being called before MyBEEntry has been
+ * initialized.
+ * ----------
+ */
+void
+pgstat_report_waiting(bool waiting)
+{
+	volatile PgBackendStatus *beentry = MyBEEntry;
+
+	if (!pgstat_collect_querystring || !beentry)
+		return;
+
+	/*
+	 * Since this is a single-byte field in a struct that only this process
+	 * may modify, there seems no need to bother with the st_changecount
+	 * protocol.  The update must appear atomic in any case.
+	 */
+	beentry->st_waiting = waiting;
+}
+
+
+/* ----------
+ * pgstat_read_current_status() -
+ *
+ *	Copy the current contents of the PgBackendStatus array to local memory,
+ *	if not already done in this transaction.
+ * ----------
+ */
+static void
+pgstat_read_current_status(void)
+{
+	volatile PgBackendStatus *beentry;
+	PgBackendStatus *localtable;
+	PgBackendStatus *localentry;
+	int			i;
+
+	Assert(!pgStatRunningInCollector);
+	if (localBackendStatusTable)
+		return;					/* already done */
+
+	pgstat_setup_memcxt();
+
+	localtable = (PgBackendStatus *)
+		MemoryContextAlloc(pgStatLocalContext,
+						   sizeof(PgBackendStatus) * MaxBackends);
+	localNumBackends = 0;
+
+	beentry = BackendStatusArray;
+	localentry = localtable;
+	for (i = 1; i <= MaxBackends; i++)
+	{
+		/*
+		 * Follow the protocol of retrying if st_changecount changes while we
+		 * copy the entry, or if it's odd.  (The check for odd is needed to
+		 * cover the case where we are able to completely copy the entry while
+		 * the source backend is between increment steps.)	We use a volatile
+		 * pointer here to ensure the compiler doesn't try to get cute.
+		 */
+		for (;;)
+		{
+			int			save_changecount = beentry->st_changecount;
+
+			/*
+			 * XXX if PGBE_ACTIVITY_SIZE is really large, it might be best to
+			 * use strcpy not memcpy for copying the activity string?
+			 */
+			memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
 
-	return &pgStatBeTable[beid - 1];
-}
+			if (save_changecount == beentry->st_changecount &&
+				(save_changecount & 1) == 0)
+				break;
 
+			/* Make sure we can break out of loop if stuck... */
+			CHECK_FOR_INTERRUPTS();
+		}
 
-/* ----------
- * pgstat_fetch_stat_numbackends() -
- *
- *	Support function for the SQL-callable pgstat* functions. Returns
- *	the maximum current backend id.
- * ----------
- */
-int
-pgstat_fetch_stat_numbackends(void)
-{
-	backend_read_statsfile();
+		beentry++;
+		/* Only valid entries get included into the local array */
+		if (localentry->st_procpid > 0)
+		{
+			localentry++;
+			localNumBackends++;
+		}
+	}
 
-	return pgStatNumBackends;
+	/* Set the pointer only after completion of a valid table */
+	localBackendStatusTable = localtable;
 }
 
 
-
 /* ------------------------------------------------------------
  * Local support functions follow
  * ------------------------------------------------------------
@@ -1418,8 +2052,6 @@ static void
 pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
 {
 	hdr->m_type = mtype;
-	hdr->m_backendid = MyBackendId;
-	hdr->m_procpid = MyProcPid;
 }
 
 
@@ -1432,112 +2064,64 @@ pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
 static void
 pgstat_send(void *msg, int len)
 {
+	int			rc;
+
 	if (pgStatSock < 0)
 		return;
 
 	((PgStat_MsgHdr *) msg)->m_size = len;
 
+	/* We'll retry after EINTR, but ignore all other failures */
+	do
+	{
+		rc = send(pgStatSock, msg, len, 0);
+	} while (rc < 0 && errno == EINTR);
+
 #ifdef USE_ASSERT_CHECKING
-	if (send(pgStatSock, msg, len, 0) < 0)
+	/* In debug builds, log send failures ... */
+	if (rc < 0)
 		elog(LOG, "could not send to statistics collector: %m");
-#else
-	send(pgStatSock, msg, len, 0);
-	/* We deliberately ignore any error from send() */
 #endif
 }
 
-
 /* ----------
- * PgstatBufferMain() -
+ * pgstat_send_bgwriter() -
  *
- *	Start up the statistics buffer process.  This is the body of the
- *	postmaster child process.
- *
- *	The argc/argv parameters are valid only in EXEC_BACKEND case.
+ *	Send bgwriter statistics to the collector.
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatBufferMain(int argc, char *argv[])
+void
+pgstat_send_bgwriter(void)
 {
-	IsUnderPostmaster = true;	/* we are a postmaster subprocess now */
-
-	MyProcPid = getpid();		/* reset MyProcPid */
-
-	/* Lose the postmaster's on-exit routines */
-	on_exit_reset();
+	/* We assume this initializes to zeroes */
+	static const PgStat_MsgBgWriter all_zeroes;
 
 	/*
-	 * Ignore all signals usually bound to some action in the postmaster,
-	 * except for SIGCHLD and SIGQUIT --- see pgstat_recvbuffer.
+	 * This function can be called even if nothing at all has happened.
+	 * In this case, avoid sending a completely empty message to
+	 * the stats collector.
 	 */
-	pqsignal(SIGHUP, SIG_IGN);
-	pqsignal(SIGINT, SIG_IGN);
-	pqsignal(SIGTERM, SIG_IGN);
-	pqsignal(SIGQUIT, pgstat_exit);
-	pqsignal(SIGALRM, SIG_IGN);
-	pqsignal(SIGPIPE, SIG_IGN);
-	pqsignal(SIGUSR1, SIG_IGN);
-	pqsignal(SIGUSR2, SIG_IGN);
-	pqsignal(SIGCHLD, pgstat_die);
-	pqsignal(SIGTTIN, SIG_DFL);
-	pqsignal(SIGTTOU, SIG_DFL);
-	pqsignal(SIGCONT, SIG_DFL);
-	pqsignal(SIGWINCH, SIG_DFL);
-	/* unblock will happen in pgstat_recvbuffer */
-
-#ifdef EXEC_BACKEND
-	pgstat_parseArgs(argc, argv);
-#endif
+	if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+		return;
 
 	/*
-	 * Start a buffering process to read from the socket, so we have a little
-	 * more time to process incoming messages.
-	 *
-	 * NOTE: the process structure is: postmaster is parent of buffer process
-	 * is parent of collector process.	This way, the buffer can detect
-	 * collector failure via SIGCHLD, whereas otherwise it wouldn't notice
-	 * collector failure until it tried to write on the pipe.  That would mean
-	 * that after the postmaster started a new collector, we'd have two buffer
-	 * processes competing to read from the UDP socket --- not good.
-	 */
-	if (pgpipe(pgStatPipe) < 0)
-		ereport(ERROR,
-				(errcode_for_socket_access(),
-				 errmsg("could not create pipe for statistics buffer: %m")));
-
-	/* child becomes collector process */
-#ifdef EXEC_BACKEND
-	pgStatCollectorPid = pgstat_forkexec(STAT_PROC_COLLECTOR);
-#else
-	pgStatCollectorPid = fork();
-#endif
-	switch (pgStatCollectorPid)
-	{
-		case -1:
-			ereport(ERROR,
-					(errmsg("could not fork statistics collector: %m")));
-
-#ifndef EXEC_BACKEND
-		case 0:
-			/* child becomes collector process */
-			PgstatCollectorMain(0, NULL);
-			break;
-#endif
+	 * Prepare and send the message
+	 */
+	pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+	pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
 
-		default:
-			/* parent becomes buffer process */
-			closesocket(pgStatPipe[0]);
-			pgstat_recvbuffer();
-	}
-	exit(0);
+	/*
+	 * Clear out the statistics buffer, so it can be re-used.
+	 */
+	MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
 
 /* ----------
  * PgstatCollectorMain() -
  *
- *	Start up the statistics collector itself.  This is the body of the
- *	postmaster grandchild process.
+ *	Start up the statistics collector process.	This is the body of the
+ *	postmaster child process.
  *
  *	The argc/argv parameters are valid only in EXEC_BACKEND case.
  * ----------
@@ -1545,34 +2129,44 @@ PgstatBufferMain(int argc, char *argv[])
 NON_EXEC_STATIC void
 PgstatCollectorMain(int argc, char *argv[])
 {
+	struct itimerval write_timeout;
+	bool		need_timer = false;
+	int			len;
 	PgStat_Msg	msg;
+
+#ifndef WIN32
+#ifdef HAVE_POLL
+	struct pollfd input_fd;
+#else
+	struct timeval sel_timeout;
 	fd_set		rfds;
-	int			readPipe;
-	int			nready;
-	int			len = 0;
-	struct timeval timeout;
-	struct timeval next_statwrite;
-	bool		need_statwrite;
-	HASHCTL		hash_ctl;
+#endif
+#endif
+
+	IsUnderPostmaster = true;	/* we are a postmaster subprocess now */
 
 	MyProcPid = getpid();		/* reset MyProcPid */
 
 	/*
-	 * Reset signal handling.  With the exception of restoring default SIGCHLD
-	 * and SIGQUIT handling, this is a no-op in the non-EXEC_BACKEND case
-	 * because we'll have inherited these settings from the buffer process;
-	 * but it's not a no-op for EXEC_BACKEND.
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.  (pgstat probably never has
+	 * any child processes, but for consistency we make all postmaster
+	 * child processes do this.)
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Ignore all signals usually bound to some action in the postmaster,
+	 * except SIGQUIT and SIGALRM.
 	 */
 	pqsignal(SIGHUP, SIG_IGN);
 	pqsignal(SIGINT, SIG_IGN);
 	pqsignal(SIGTERM, SIG_IGN);
-#ifndef WIN32
-	pqsignal(SIGQUIT, SIG_IGN);
-#else
-	/* kluge to allow buffer process to kill collector; FIXME */
 	pqsignal(SIGQUIT, pgstat_exit);
-#endif
-	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGALRM, force_statwrite);
 	pqsignal(SIGPIPE, SIG_IGN);
 	pqsignal(SIGUSR1, SIG_IGN);
 	pqsignal(SIGUSR2, SIG_IGN);
@@ -1583,399 +2177,141 @@ PgstatCollectorMain(int argc, char *argv[])
 	pqsignal(SIGWINCH, SIG_DFL);
 	PG_SETMASK(&UnBlockSig);
 
-#ifdef EXEC_BACKEND
-	pgstat_parseArgs(argc, argv);
-#endif
-
-	/* Close unwanted files */
-	closesocket(pgStatPipe[1]);
-	closesocket(pgStatSock);
-
 	/*
 	 * Identify myself via ps
 	 */
-	init_ps_display("stats collector process", "", "");
-	set_ps_display("");
+	init_ps_display("stats collector process", "", "", "");
 
 	/*
 	 * Arrange to write the initial status file right away
 	 */
-	gettimeofday(&next_statwrite, NULL);
-	need_statwrite = TRUE;
+	need_statwrite = true;
+
+	/* Preset the delay between status file writes */
+	MemSet(&write_timeout, 0, sizeof(struct itimerval));
+	write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
+	write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
 
 	/*
 	 * Read in an existing statistics stats file or initialize the stats to
 	 * zero.
 	 */
-	pgStatRunningInCollector = TRUE;
-	pgstat_read_statsfile(&pgStatDBHash, InvalidOid, NULL, NULL);
-
-	/*
-	 * Create the dead backend hashtable
-	 */
-	memset(&hash_ctl, 0, sizeof(hash_ctl));
-	hash_ctl.keysize = sizeof(int);
-	hash_ctl.entrysize = sizeof(PgStat_StatBeDead);
-	hash_ctl.hash = tag_hash;
-	pgStatBeDead = hash_create("Dead Backends", PGSTAT_BE_HASH_SIZE,
-							   &hash_ctl, HASH_ELEM | HASH_FUNCTION);
+	pgStatRunningInCollector = true;
+	pgStatDBHash = pgstat_read_statsfile(InvalidOid);
 
 	/*
-	 * Create the known backends table
+	 * Set up the descriptor set for select(2).	Since only one bit in the set
+	 * ever changes, we need not repeat FD_ZERO each time.
 	 */
-	pgStatBeTable = (PgStat_StatBeEntry *)
-		palloc0(sizeof(PgStat_StatBeEntry) * MaxBackends);
-
-	readPipe = pgStatPipe[0];
+#if !defined(HAVE_POLL) && !defined(WIN32)
+	FD_ZERO(&rfds);
+#endif
 
 	/*
-	 * Process incoming messages and handle all the reporting stuff until
-	 * there are no more messages.
+	 * Loop to process messages until we get SIGQUIT or detect ungraceful
+	 * death of our parent postmaster.
+	 *
+	 * For performance reasons, we don't want to do a PostmasterIsAlive() test
+	 * after every message; instead, do it at statwrite time and whenever
+	 * select()/poll() returns because its timeout elapsed.
 	 */
 	for (;;)
 	{
+		int			got_data;
+
+		/*
+		 * Quit if we get SIGQUIT from the postmaster.
+		 */
+		if (need_exit)
+			break;
+
 		/*
-		 * If we need to write the status file again (there have been changes
-		 * in the statistics since we wrote it last) calculate the timeout
-		 * until we have to do so.
+		 * If time to write the stats file, do so.	Note that the alarm
+		 * interrupt isn't re-enabled immediately, but only after we next
+		 * receive a stats message; so no cycles are wasted when there is
+		 * nothing going on.
 		 */
 		if (need_statwrite)
 		{
-			struct timeval now;
+			/* On postmaster death, quit loop; final file is written after it */
+			if (!PostmasterIsAlive(true))
+				break;
 
-			gettimeofday(&now, NULL);
-			/* avoid assuming that tv_sec is signed */
-			if (now.tv_sec > next_statwrite.tv_sec ||
-				(now.tv_sec == next_statwrite.tv_sec &&
-				 now.tv_usec >= next_statwrite.tv_usec))
-			{
-				timeout.tv_sec = 0;
-				timeout.tv_usec = 0;
-			}
-			else
-			{
-				timeout.tv_sec = next_statwrite.tv_sec - now.tv_sec;
-				timeout.tv_usec = next_statwrite.tv_usec - now.tv_usec;
-				if (timeout.tv_usec < 0)
-				{
-					timeout.tv_sec--;
-					timeout.tv_usec += 1000000;
-				}
-			}
+			pgstat_write_statsfile();
+			need_statwrite = false;
+			need_timer = true;
 		}
 
 		/*
-		 * Setup the descriptor set for select(2)
+		 * Wait for a message to arrive; but not for more than
+		 * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
+		 * shut down after an ungraceful postmaster termination; so it needn't
+		 * be very fast.  However, on some systems SIGQUIT won't interrupt the
+		 * poll/select call, so this also limits speed of response to SIGQUIT,
+		 * which is more important.)
+		 *
+		 * We use poll(2) if available, otherwise select(2).
+		 * Win32 has its own implementation.
 		 */
-		FD_ZERO(&rfds);
-		FD_SET(readPipe, &rfds);
+#ifndef WIN32
+#ifdef HAVE_POLL
+		input_fd.fd = pgStatSock;
+		input_fd.events = POLLIN | POLLERR;
+		input_fd.revents = 0;
 
-		/*
-		 * Now wait for something to do.
-		 */
-		nready = select(readPipe + 1, &rfds, NULL, NULL,
-						(need_statwrite) ? &timeout : NULL);
-		if (nready < 0)
+		if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
 		{
 			if (errno == EINTR)
 				continue;
 			ereport(ERROR,
 					(errcode_for_socket_access(),
-					 errmsg("select() failed in statistics collector: %m")));
-		}
-
-		/*
-		 * If there are no descriptors ready, our timeout for writing the
-		 * stats file happened.
-		 */
-		if (nready == 0)
-		{
-			pgstat_write_statsfile();
-			need_statwrite = FALSE;
-
-			continue;
-		}
-
-		/*
-		 * Check if there is a new statistics message to collect.
-		 */
-		if (FD_ISSET(readPipe, &rfds))
-		{
-			/*
-			 * We may need to issue multiple read calls in case the buffer
-			 * process didn't write the message in a single write, which is
-			 * possible since it dumps its buffer bytewise. In any case, we'd
-			 * need two reads since we don't know the message length
-			 * initially.
-			 */
-			int			nread = 0;
-			int			targetlen = sizeof(PgStat_MsgHdr);		/* initial */
-			bool		pipeEOF = false;
-
-			while (nread < targetlen)
-			{
-				len = piperead(readPipe, ((char *) &msg) + nread,
-							   targetlen - nread);
-				if (len < 0)
-				{
-					if (errno == EINTR)
-						continue;
-					ereport(ERROR,
-							(errcode_for_socket_access(),
-							 errmsg("could not read from statistics collector pipe: %m")));
-				}
-				if (len == 0)	/* EOF on the pipe! */
-				{
-					pipeEOF = true;
-					break;
-				}
-				nread += len;
-				if (nread == sizeof(PgStat_MsgHdr))
-				{
-					/* we have the header, compute actual msg length */
-					targetlen = msg.msg_hdr.m_size;
-					if (targetlen < (int) sizeof(PgStat_MsgHdr) ||
-						targetlen > (int) sizeof(msg))
-					{
-						/*
-						 * Bogus message length implies that we got out of
-						 * sync with the buffer process somehow. Abort so that
-						 * we can restart both processes.
-						 */
-						ereport(ERROR,
-							  (errmsg("invalid statistics message length")));
-					}
-				}
-			}
-
-			/*
-			 * EOF on the pipe implies that the buffer process exited. Fall
-			 * out of outer loop.
-			 */
-			if (pipeEOF)
-				break;
-
-			/*
-			 * Distribute the message to the specific function handling it.
-			 */
-			switch (msg.msg_hdr.m_type)
-			{
-				case PGSTAT_MTYPE_DUMMY:
-					break;
-
-				case PGSTAT_MTYPE_BESTART:
-					pgstat_recv_bestart((PgStat_MsgBestart *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_BETERM:
-					pgstat_recv_beterm((PgStat_MsgBeterm *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_TABSTAT:
-					pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_TABPURGE:
-					pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_ACTIVITY:
-					pgstat_recv_activity((PgStat_MsgActivity *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_DROPDB:
-					pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_RESETCOUNTER:
-					pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
-											 nread);
-					break;
-
-				case PGSTAT_MTYPE_AUTOVAC_START:
-					pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_VACUUM:
-					pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, nread);
-					break;
-
-				case PGSTAT_MTYPE_ANALYZE:
-					pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, nread);
-					break;
-
-				default:
-					break;
-			}
-
-			/*
-			 * Globally count messages.
-			 */
-			pgStatNumMessages++;
-
-			/*
-			 * If this is the first message after we wrote the stats file the
-			 * last time, setup the timeout that it'd be written.
-			 */
-			if (!need_statwrite)
-			{
-				gettimeofday(&next_statwrite, NULL);
-				next_statwrite.tv_usec += ((PGSTAT_STAT_INTERVAL) * 1000);
-				next_statwrite.tv_sec += (next_statwrite.tv_usec / 1000000);
-				next_statwrite.tv_usec %= 1000000;
-				need_statwrite = TRUE;
-			}
+					 errmsg("poll() failed in statistics collector: %m")));
 		}
 
-		/*
-		 * Note that we do NOT check for postmaster exit inside the loop; only
-		 * EOF on the buffer pipe causes us to fall out.  This ensures we
-		 * don't exit prematurely if there are still a few messages in the
-		 * buffer or pipe at postmaster shutdown.
-		 */
-	}
-
-	/*
-	 * Okay, we saw EOF on the buffer pipe, so there are no more messages to
-	 * process.  If the buffer process quit because of postmaster shutdown, we
-	 * want to save the final stats to reuse at next startup. But if the
-	 * buffer process failed, it seems best not to (there may even now be a
-	 * new collector firing up, and we don't want it to read a
-	 * partially-rewritten stats file).
-	 */
-	if (!PostmasterIsAlive(false))
-		pgstat_write_statsfile();
-}
-
-
-/* ----------
- * pgstat_recvbuffer() -
- *
- *	This is the body of the separate buffering process. Its only
- *	purpose is to receive messages from the UDP socket as fast as
- *	possible and forward them over a pipe into the collector itself.
- *	If the collector is slow to absorb messages, they are buffered here.
- * ----------
- */
-static void
-pgstat_recvbuffer(void)
-{
-	fd_set		rfds;
-	fd_set		wfds;
-	struct timeval timeout;
-	int			writePipe = pgStatPipe[1];
-	int			maxfd;
-	int			nready;
-	int			len;
-	int			xfr;
-	int			frm;
-	PgStat_Msg	input_buffer;
-	char	   *msgbuffer;
-	int			msg_send = 0;	/* next send index in buffer */
-	int			msg_recv = 0;	/* next receive index */
-	int			msg_have = 0;	/* number of bytes stored */
-	bool		overflow = false;
-
-	/*
-	 * Identify myself via ps
-	 */
-	init_ps_display("stats buffer process", "", "");
-	set_ps_display("");
-
-	/*
-	 * We want to die if our child collector process does.	There are two ways
-	 * we might notice that it has died: receive SIGCHLD, or get a write
-	 * failure on the pipe leading to the child.  We can set SIGPIPE to kill
-	 * us here.  Our SIGCHLD handler was already set up before we forked (must
-	 * do it that way, else it's a race condition).
-	 */
-	pqsignal(SIGPIPE, SIG_DFL);
-	PG_SETMASK(&UnBlockSig);
-
-	/*
-	 * Set the write pipe to nonblock mode, so that we cannot block when the
-	 * collector falls behind.
-	 */
-	if (!pg_set_noblock(writePipe))
-		ereport(ERROR,
-				(errcode_for_socket_access(),
-				 errmsg("could not set statistics collector pipe to nonblocking mode: %m")));
-
-	/*
-	 * Allocate the message buffer
-	 */
-	msgbuffer = (char *) palloc(PGSTAT_RECVBUFFERSZ);
-
-	/*
-	 * Loop forever
-	 */
-	for (;;)
-	{
-		FD_ZERO(&rfds);
-		FD_ZERO(&wfds);
-		maxfd = -1;
-
-		/*
-		 * As long as we have buffer space we add the socket to the read
-		 * descriptor set.
-		 */
-		if (msg_have <= (int) (PGSTAT_RECVBUFFERSZ - sizeof(PgStat_Msg)))
-		{
-			FD_SET(pgStatSock, &rfds);
-			maxfd = pgStatSock;
-			overflow = false;
-		}
-		else
-		{
-			if (!overflow)
-			{
-				ereport(LOG,
-						(errmsg("statistics buffer is full")));
-				overflow = true;
-			}
-		}
+		got_data = (input_fd.revents != 0);
+#else							/* !HAVE_POLL */
 
-		/*
-		 * If we have messages to write out, we add the pipe to the write
-		 * descriptor set.
-		 */
-		if (msg_have > 0)
-		{
-			FD_SET(writePipe, &wfds);
-			if (writePipe > maxfd)
-				maxfd = writePipe;
-		}
+		FD_SET(pgStatSock, &rfds);
 
 		/*
-		 * Wait for some work to do; but not for more than 10 seconds. (This
-		 * determines how quickly we will shut down after an ungraceful
-		 * postmaster termination; so it needn't be very fast.)
+		 * timeout struct is modified by select() on some operating systems,
+		 * so re-fill it each time.
 		 */
-		timeout.tv_sec = 10;
-		timeout.tv_usec = 0;
+		sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
+		sel_timeout.tv_usec = 0;
 
-		nready = select(maxfd + 1, &rfds, &wfds, NULL, &timeout);
-		if (nready < 0)
+		if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
 		{
 			if (errno == EINTR)
 				continue;
 			ereport(ERROR,
 					(errcode_for_socket_access(),
-					 errmsg("select() failed in statistics buffer: %m")));
+					 errmsg("select() failed in statistics collector: %m")));
 		}
 
+		got_data = FD_ISSET(pgStatSock, &rfds);
+#endif   /* HAVE_POLL */
+#else /* WIN32 */
+		got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
+											   PGSTAT_SELECT_TIMEOUT*1000);
+#endif
+
 		/*
 		 * If there is a message on the socket, read it and check for
 		 * validity.
 		 */
-		if (FD_ISSET(pgStatSock, &rfds))
+		if (got_data)
 		{
-			len = recv(pgStatSock, (char *) &input_buffer,
+			len = recv(pgStatSock, (char *) &msg,
 					   sizeof(PgStat_Msg), 0);
 			if (len < 0)
+			{
+				if (errno == EINTR)
+					continue;
 				ereport(ERROR,
 						(errcode_for_socket_access(),
 						 errmsg("could not read statistics message: %m")));
+			}
 
 			/*
 			 * We ignore messages that are smaller than our common header
@@ -1986,193 +2322,102 @@ pgstat_recvbuffer(void)
 			/*
 			 * The received length must match the length in the header
 			 */
-			if (input_buffer.msg_hdr.m_size != len)
+			if (msg.msg_hdr.m_size != len)
 				continue;
 
 			/*
-			 * O.K. - we accept this message.  Copy it to the circular
-			 * msgbuffer.
+			 * O.K. - we accept this message.  Process it.
 			 */
-			frm = 0;
-			while (len > 0)
-			{
-				xfr = PGSTAT_RECVBUFFERSZ - msg_recv;
-				if (xfr > len)
-					xfr = len;
-				Assert(xfr > 0);
-				memcpy(msgbuffer + msg_recv,
-					   ((char *) &input_buffer) + frm,
-					   xfr);
-				msg_recv += xfr;
-				if (msg_recv == PGSTAT_RECVBUFFERSZ)
-					msg_recv = 0;
-				msg_have += xfr;
-				frm += xfr;
-				len -= xfr;
-			}
-		}
-
-		/*
-		 * If the collector is ready to receive, write some data into his
-		 * pipe.  We may or may not be able to write all that we have.
-		 *
-		 * NOTE: if what we have is less than PIPE_BUF bytes but more than the
-		 * space available in the pipe buffer, most kernels will refuse to
-		 * write any of it, and will return EAGAIN.  This means we will
-		 * busy-loop until the situation changes (either because the collector
-		 * caught up, or because more data arrives so that we have more than
-		 * PIPE_BUF bytes buffered).  This is not good, but is there any way
-		 * around it?  We have no way to tell when the collector has caught
-		 * up...
-		 */
-		if (FD_ISSET(writePipe, &wfds))
-		{
-			xfr = PGSTAT_RECVBUFFERSZ - msg_send;
-			if (xfr > msg_have)
-				xfr = msg_have;
-			Assert(xfr > 0);
-			len = pipewrite(writePipe, msgbuffer + msg_send, xfr);
-			if (len < 0)
+			switch (msg.msg_hdr.m_type)
 			{
-				if (errno == EINTR || errno == EAGAIN)
-					continue;	/* not enough space in pipe */
-				ereport(ERROR,
-						(errcode_for_socket_access(),
-				errmsg("could not write to statistics collector pipe: %m")));
-			}
-			/* NB: len < xfr is okay */
-			msg_send += len;
-			if (msg_send == PGSTAT_RECVBUFFERSZ)
-				msg_send = 0;
-			msg_have -= len;
-		}
-
-		/*
-		 * Make sure we forwarded all messages before we check for postmaster
-		 * termination.
-		 */
-		if (msg_have != 0 || FD_ISSET(pgStatSock, &rfds))
-			continue;
+				case PGSTAT_MTYPE_DUMMY:
+					break;
 
-		/*
-		 * If the postmaster has terminated, we die too.  (This is no longer
-		 * the normal exit path, however.)
-		 */
-		if (!PostmasterIsAlive(true))
-			exit(0);
-	}
-}
+				case PGSTAT_MTYPE_TABSTAT:
+					pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
+					break;
 
-/* SIGQUIT signal handler for buffer process */
-static void
-pgstat_exit(SIGNAL_ARGS)
-{
-	/*
-	 * For now, we just nail the doors shut and get out of town.  It might be
-	 * cleaner to allow any pending messages to be sent, but that creates a
-	 * tradeoff against speed of exit.
-	 */
+				case PGSTAT_MTYPE_TABPURGE:
+					pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
+					break;
 
-	/*
-	 * If running in bufferer, kill our collector as well. On some broken
-	 * win32 systems, it does not shut down automatically because of issues
-	 * with socket inheritance.  XXX so why not fix the socket inheritance...
-	 */
-#ifdef WIN32
-	if (pgStatCollectorPid > 0)
-		kill(pgStatCollectorPid, SIGQUIT);
-#endif
-	exit(0);
-}
+				case PGSTAT_MTYPE_DROPDB:
+					pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
+					break;
 
-/* SIGCHLD signal handler for buffer process */
-static void
-pgstat_die(SIGNAL_ARGS)
-{
-	exit(1);
-}
+				case PGSTAT_MTYPE_RESETCOUNTER:
+					pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
+											 len);
+					break;
 
+				case PGSTAT_MTYPE_AUTOVAC_START:
+					pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
+					break;
 
-/* ----------
- * pgstat_add_backend() -
- *
- *	Support function to keep our backend list up to date.
- * ----------
- */
-static int
-pgstat_add_backend(PgStat_MsgHdr *msg)
-{
-	PgStat_StatBeEntry *beentry;
-	PgStat_StatBeDead *deadbe;
+				case PGSTAT_MTYPE_VACUUM:
+					pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
+					break;
 
-	/*
-	 * Check that the backend ID is valid
-	 */
-	if (msg->m_backendid < 1 || msg->m_backendid > MaxBackends)
-	{
-		ereport(LOG,
-				(errmsg("invalid server process ID %d", msg->m_backendid)));
-		return -1;
-	}
+				case PGSTAT_MTYPE_ANALYZE:
+					pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
+					break;
 
-	/*
-	 * Get the slot for this backendid.
-	 */
-	beentry = &pgStatBeTable[msg->m_backendid - 1];
+				case PGSTAT_MTYPE_BGWRITER:
+					pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+					break;
 
-	/*
-	 * If the slot contains the PID of this backend, everything is fine and we
-	 * have nothing to do. Note that all the slots are zero'd out when the
-	 * collector is started. We assume that a slot is "empty" iff procpid ==
-	 * 0.
-	 */
-	if (beentry->procpid > 0 && beentry->procpid == msg->m_procpid)
-		return 0;
+				default:
+					break;
+			}
 
-	/*
-	 * Lookup if this backend is known to be dead. This can be caused due to
-	 * messages arriving in the wrong order - e.g. postmaster's BETERM message
-	 * might have arrived before we received all the backends stats messages,
-	 * or even a new backend with the same backendid was faster in sending his
-	 * BESTART.
-	 *
-	 * If the backend is known to be dead, we ignore this add.
-	 */
-	deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
-											   (void *) &(msg->m_procpid),
-											   HASH_FIND, NULL);
-	if (deadbe)
-		return 1;
+			/*
+			 * If this is the first message received since we last wrote the
+			 * stats file, arm the alarm timer so that the file gets written
+			 * again later.
+			 */
+			if (need_timer)
+			{
+				if (setitimer(ITIMER_REAL, &write_timeout, NULL))
+					ereport(ERROR,
+					(errmsg("could not set statistics collector timer: %m")));
+				need_timer = false;
+			}
+		}
+		else
+		{
+			/*
+			 * We can only get here if the select/poll timeout elapsed. Check
+			 * for postmaster death.
+			 */
+			if (!PostmasterIsAlive(true))
+				break;
+		}
+	}							/* end of message-processing loop */
 
 	/*
-	 * Backend isn't known to be dead. If it's slot is currently used, we have
-	 * to kick out the old backend.
+	 * Save the final stats to reuse at next startup.
 	 */
-	if (beentry->procpid > 0)
-		pgstat_sub_backend(beentry->procpid);
+	pgstat_write_statsfile();
 
-	/* Must be able to distinguish between empty and non-empty slots */
-	Assert(msg->m_procpid > 0);
+	exit(0);
+}
 
-	/* Put this new backend into the slot */
-	beentry->procpid = msg->m_procpid;
-	beentry->start_timestamp = GetCurrentTimestamp();
-	beentry->activity_start_timestamp = 0;
-	beentry->activity[0] = '\0';
 
-	/*
-	 * We can't initialize the rest of the data in this slot until we see the
-	 * BESTART message. Therefore, we set the database and user to sentinel
-	 * values, to indicate "undefined". There is no easy way to do this for
-	 * the client address, so make sure to check that the database or user are
-	 * defined before accessing the client address.
-	 */
-	beentry->userid = InvalidOid;
-	beentry->databaseid = InvalidOid;
+/* SIGQUIT signal handler for collector process */
+static void
+pgstat_exit(SIGNAL_ARGS)
+{
+	need_exit = true;
+}
 
-	return 0;
+/* SIGALRM signal handler for collector process */
+static void
+force_statwrite(SIGNAL_ARGS)
+{
+	need_statwrite = true;
 }
 
+
 /*
  * Lookup the hash table entry for the specified database. If no hash
  * table entry exists, initialize it, if the create parameter is true.
@@ -2203,7 +2448,11 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 		result->n_xact_rollback = 0;
 		result->n_blocks_fetched = 0;
 		result->n_blocks_hit = 0;
-		result->destroy = 0;
+		result->n_tuples_returned = 0;
+		result->n_tuples_fetched = 0;
+		result->n_tuples_inserted = 0;
+		result->n_tuples_updated = 0;
+		result->n_tuples_deleted = 0;
 		result->last_autovac_time = 0;
 
 		memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -2219,61 +2468,6 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 	return result;
 }
 
-/* ----------
- * pgstat_sub_backend() -
- *
- *	Remove a backend from the actual backends list.
- * ----------
- */
-static void
-pgstat_sub_backend(int procpid)
-{
-	int			i;
-	PgStat_StatBeDead *deadbe;
-	bool		found;
-
-	/*
-	 * Search in the known-backends table for the slot containing this PID.
-	 */
-	for (i = 0; i < MaxBackends; i++)
-	{
-		if (pgStatBeTable[i].procpid == procpid)
-		{
-			/*
-			 * That's him. Add an entry to the known to be dead backends. Due
-			 * to possible misorder in the arrival of UDP packets it's
-			 * possible that even if we know the backend is dead, there could
-			 * still be messages queued that arrive later. Those messages must
-			 * not cause our number of backends statistics to get screwed up,
-			 * so we remember for a couple of seconds that this PID is dead
-			 * and ignore them (only the counting of backends, not the table
-			 * access stats they sent).
-			 */
-			deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
-													   (void *) &procpid,
-													   HASH_ENTER,
-													   &found);
-
-			if (!found)
-			{
-				deadbe->backendid = i + 1;
-				deadbe->destroy = PGSTAT_DESTROY_COUNT;
-			}
-
-			/*
-			 * Declare the backend slot empty.
-			 */
-			pgStatBeTable[i].procpid = 0;
-			return;
-		}
-	}
-
-	/*
-	 * No big problem if not found. This can happen if UDP messages arrive out
-	 * of order here.
-	 */
-}
-
 
 /* ----------
  * pgstat_write_statsfile() -
@@ -2288,9 +2482,7 @@ pgstat_write_statsfile(void)
 	HASH_SEQ_STATUS tstat;
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatTabEntry *tabentry;
-	PgStat_StatBeDead *deadbe;
 	FILE	   *fpout;
-	int			i;
 	int32		format_id;
 
 	/*
@@ -2312,6 +2504,11 @@ pgstat_write_statsfile(void)
 	format_id = PGSTAT_FILE_FORMAT_ID;
 	fwrite(&format_id, sizeof(format_id), 1, fpout);
 
+	/*
+	 * Write global stats struct
+	 */
+	fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
 	/*
 	 * Walk through the database table.
 	 */
@@ -2319,68 +2516,19 @@ pgstat_write_statsfile(void)
 	while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
 	{
 		/*
-		 * If this database is marked destroyed, count down and do so if it
-		 * reaches 0.
-		 */
-		if (dbentry->destroy > 0)
-		{
-			if (--(dbentry->destroy) == 0)
-			{
-				if (dbentry->tables != NULL)
-					hash_destroy(dbentry->tables);
-
-				if (hash_search(pgStatDBHash,
-								(void *) &(dbentry->databaseid),
-								HASH_REMOVE, NULL) == NULL)
-					ereport(ERROR,
-							(errmsg("database hash table corrupted "
-									"during cleanup --- abort")));
-			}
-
-			/*
-			 * Don't include statistics for it.
-			 */
-			continue;
-		}
-
-		/*
-		 * Write out the DB line including the number of live backends.
+		 * Write out the DB entry including the number of live backends. We
+		 * don't write the tables pointer since it's of no use to any other
+		 * process.
 		 */
 		fputc('D', fpout);
-		fwrite(dbentry, sizeof(PgStat_StatDBEntry), 1, fpout);
+		fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
 
 		/*
-		 * Walk through the databases access stats per table.
+		 * Walk through the database's access stats per table.
 		 */
 		hash_seq_init(&tstat, dbentry->tables);
 		while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
 		{
-			/*
-			 * If table entry marked for destruction, same as above for the
-			 * database entry.
-			 */
-			if (tabentry->destroy > 0)
-			{
-				if (--(tabentry->destroy) == 0)
-				{
-					if (hash_search(dbentry->tables,
-									(void *) &(tabentry->tableid),
-									HASH_REMOVE, NULL) == NULL)
-					{
-						ereport(ERROR,
-								(errmsg("tables hash table for "
-										"database %u corrupted during "
-										"cleanup --- abort",
-										dbentry->databaseid)));
-					}
-				}
-				continue;
-			}
-
-			/*
-			 * At least we think this is still a live table. Print its access
-			 * stats.
-			 */
 			fputc('T', fpout);
 			fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
 		}
@@ -2391,134 +2539,66 @@ pgstat_write_statsfile(void)
 		fputc('d', fpout);
 	}
 
-	/*
-	 * Write out the known running backends to the stats file.
-	 */
-	i = MaxBackends;
-	fputc('M', fpout);
-	fwrite(&i, sizeof(i), 1, fpout);
-
-	for (i = 0; i < MaxBackends; i++)
-	{
-		if (pgStatBeTable[i].procpid > 0)
-		{
-			fputc('B', fpout);
-			fwrite(&pgStatBeTable[i], sizeof(PgStat_StatBeEntry), 1, fpout);
-		}
-	}
-
 	/*
 	 * No more output to be done. Close the temp file and replace the old
-	 * pgstat.stat with it.
+	 * pgstat.stat with it.  The ferror() check replaces testing for error
+	 * after each individual fputc or fwrite above.
 	 */
 	fputc('E', fpout);
-	if (fclose(fpout) < 0)
+
+	if (ferror(fpout))
 	{
 		ereport(LOG,
 				(errcode_for_file_access(),
-			   errmsg("could not close temporary statistics file \"%s\": %m",
+			   errmsg("could not write temporary statistics file \"%s\": %m",
 					  PGSTAT_STAT_TMPFILE)));
+		fclose(fpout);
+		unlink(PGSTAT_STAT_TMPFILE);
 	}
-	else
+	else if (fclose(fpout) < 0)
 	{
-		if (rename(PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME) < 0)
-		{
-			ereport(LOG,
-					(errcode_for_file_access(),
-					 errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
-							PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME)));
-		}
+		ereport(LOG,
+				(errcode_for_file_access(),
+			   errmsg("could not close temporary statistics file \"%s\": %m",
+					  PGSTAT_STAT_TMPFILE)));
+		unlink(PGSTAT_STAT_TMPFILE);
 	}
-
-	/*
-	 * Clear out the dead backends table
-	 */
-	hash_seq_init(&hstat, pgStatBeDead);
-	while ((deadbe = (PgStat_StatBeDead *) hash_seq_search(&hstat)) != NULL)
+	else if (rename(PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME) < 0)
 	{
-		/*
-		 * Count down the destroy delay and remove entries where it reaches 0.
-		 */
-		if (--(deadbe->destroy) <= 0)
-		{
-			if (hash_search(pgStatBeDead,
-							(void *) &(deadbe->procpid),
-							HASH_REMOVE, NULL) == NULL)
-			{
-				ereport(ERROR,
-						(errmsg("dead-server-process hash table corrupted "
-								"during cleanup --- abort")));
-			}
-		}
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
+						PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME)));
+		unlink(PGSTAT_STAT_TMPFILE);
 	}
 }
 
-/*
- * qsort/bsearch comparison routine for PIDs
- *
- * We assume PIDs are nonnegative, so there's no overflow risk
- */
-static int
-comparePids(const void *v1, const void *v2)
-{
-	return *((const int *) v1) - *((const int *) v2);
-}
 
 /* ----------
  * pgstat_read_statsfile() -
  *
- *	Reads in an existing statistics collector and initializes the
- *	databases' hash table (whose entries point to the tables' hash tables)
- *	and the current backend table.
+ *	Reads in an existing statistics collector file and initializes the
+ *	databases' hash table (whose entries point to the tables' hash tables).
  * ----------
  */
-static void
-pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
-					  PgStat_StatBeEntry **betab, int *numbackends)
+static HTAB *
+pgstat_read_statsfile(Oid onlydb)
 {
 	PgStat_StatDBEntry *dbentry;
 	PgStat_StatDBEntry dbbuf;
 	PgStat_StatTabEntry *tabentry;
 	PgStat_StatTabEntry tabbuf;
-	PgStat_StatBeEntry *beentry;
 	HASHCTL		hash_ctl;
+	HTAB	   *dbhash;
 	HTAB	   *tabhash = NULL;
 	FILE	   *fpin;
 	int32		format_id;
-	int			maxbackends = 0;
-	int			havebackends = 0;
 	bool		found;
-	int		   *live_pids;
-	MemoryContext use_mcxt;
-	int			mcxt_flags;
 
 	/*
-	 * If running in the collector or the autovacuum process, we use the
-	 * DynaHashCxt memory context.	If running in a backend, we use the
-	 * TopTransactionContext instead, so the caller must only know the last
-	 * XactId when this call happened to know if his tables are still valid or
-	 * already gone!
-	 *
-	 * Also, if running in a regular backend, we check backend entries against
-	 * the PGPROC array so that we can detect stale entries.  This lets us
-	 * discard entries whose BETERM message got lost for some reason.
+	 * The tables will live in pgStatLocalContext.
 	 */
-	if (pgStatRunningInCollector || IsAutoVacuumProcess())
-	{
-		use_mcxt = NULL;
-		mcxt_flags = 0;
-		live_pids = NULL;
-	}
-	else
-	{
-		use_mcxt = TopTransactionContext;
-		mcxt_flags = HASH_CONTEXT;
-		live_pids = GetAllBackendPids();
-		/* Sort the PID array so we can use bsearch */
-		if (live_pids[0] > 1)
-			qsort((void *) &live_pids[1], live_pids[0], sizeof(int),
-				  comparePids);
-	}
+	pgstat_setup_memcxt();
 
 	/*
 	 * Create the DB hashtable
@@ -2527,18 +2607,15 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 	hash_ctl.keysize = sizeof(Oid);
 	hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
 	hash_ctl.hash = oid_hash;
-	hash_ctl.hcxt = use_mcxt;
-	*dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-						  HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+	hash_ctl.hcxt = pgStatLocalContext;
+	dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
+						 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
 	/*
-	 * Initialize the number of known backends to zero, just in case we do a
-	 * silent error return below.
+	 * Clear out global statistics so they start from zero in case we can't
+	 * load an existing statsfile.
 	 */
-	if (numbackends != NULL)
-		*numbackends = 0;
-	if (betab != NULL)
-		*betab = NULL;
+	memset(&globalStats, 0, sizeof(globalStats));
 
 	/*
 	 * Try to open the status file. If it doesn't exist, the backends simply
@@ -2546,7 +2623,7 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 	 * with empty counters.
 	 */
 	if ((fpin = AllocateFile(PGSTAT_STAT_FILENAME, PG_BINARY_R)) == NULL)
-		return;
+		return dbhash;
 
 	/*
 	 * Verify it's of the expected format.
@@ -2559,6 +2636,16 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 		goto done;
 	}
 
+	/*
+	 * Read global stats struct
+	 */
+	if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+	{
+		ereport(pgStatRunningInCollector ? LOG : WARNING,
+				(errmsg("corrupted pgstat.stat file")));
+		goto done;
+	}
+
 	/*
 	 * We found an existing collector stats file. Read it and put all the
 	 * hashtable entries into place.
@@ -2573,7 +2660,8 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 				 * until a 'd' is encountered.
 				 */
 			case 'D':
-				if (fread(&dbbuf, 1, sizeof(dbbuf), fpin) != sizeof(dbbuf))
+				if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
+						  fpin) != offsetof(PgStat_StatDBEntry, tables))
 				{
 					ereport(pgStatRunningInCollector ? LOG : WARNING,
 							(errmsg("corrupted pgstat.stat file")));
@@ -2583,7 +2671,7 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 				/*
 				 * Add to the DB hash
 				 */
-				dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
+				dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
 												  (void *) &dbbuf.databaseid,
 															 HASH_ENTER,
 															 &found);
@@ -2596,8 +2684,6 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 
 				memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
 				dbentry->tables = NULL;
-				dbentry->destroy = 0;
-				dbentry->n_backends = 0;
 
 				/*
 				 * Don't collect tables if not the requested DB (or the
@@ -2614,14 +2700,14 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 				hash_ctl.keysize = sizeof(Oid);
 				hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
 				hash_ctl.hash = oid_hash;
-				hash_ctl.hcxt = use_mcxt;
+				hash_ctl.hcxt = pgStatLocalContext;
 				dbentry->tables = hash_create("Per-database table",
 											  PGSTAT_TAB_HASH_SIZE,
 											  &hash_ctl,
-									 HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+									 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
 				/*
-				 * Arrange that following 'T's add entries to this databases
+				 * Arrange that following 'T's add entries to this database's
 				 * tables hash table.
 				 */
 				tabhash = dbentry->tables;
@@ -2638,7 +2724,8 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 				 * 'T'	A PgStat_StatTabEntry follows.
 				 */
 			case 'T':
-				if (fread(&tabbuf, 1, sizeof(tabbuf), fpin) != sizeof(tabbuf))
+				if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
+						  fpin) != sizeof(PgStat_StatTabEntry))
 				{
 					ereport(pgStatRunningInCollector ? LOG : WARNING,
 							(errmsg("corrupted pgstat.stat file")));
@@ -2660,100 +2747,9 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 					ereport(pgStatRunningInCollector ? LOG : WARNING,
 							(errmsg("corrupted pgstat.stat file")));
 					goto done;
-				}
-
-				memcpy(tabentry, &tabbuf, sizeof(tabbuf));
-				break;
-
-				/*
-				 * 'M'	The maximum number of backends to expect follows.
-				 */
-			case 'M':
-				if (betab == NULL || numbackends == NULL)
-					goto done;
-				if (fread(&maxbackends, 1, sizeof(maxbackends), fpin) !=
-					sizeof(maxbackends))
-				{
-					ereport(pgStatRunningInCollector ? LOG : WARNING,
-							(errmsg("corrupted pgstat.stat file")));
-					goto done;
-				}
-				if (maxbackends == 0)
-					goto done;
-
-				/*
-				 * Allocate space (in TopTransactionContext too) for the
-				 * backend table.
-				 */
-				if (use_mcxt == NULL)
-					*betab = (PgStat_StatBeEntry *)
-						palloc(sizeof(PgStat_StatBeEntry) * maxbackends);
-				else
-					*betab = (PgStat_StatBeEntry *)
-						MemoryContextAlloc(use_mcxt,
-								   sizeof(PgStat_StatBeEntry) * maxbackends);
-				break;
-
-				/*
-				 * 'B'	A PgStat_StatBeEntry follows.
-				 */
-			case 'B':
-				if (betab == NULL || numbackends == NULL || *betab == NULL)
-					goto done;
-
-				if (havebackends >= maxbackends)
-					goto done;
-
-				/*
-				 * Read it directly into the table.
-				 */
-				beentry = &(*betab)[havebackends];
-
-				if (fread(beentry, 1, sizeof(PgStat_StatBeEntry), fpin) !=
-					sizeof(PgStat_StatBeEntry))
-				{
-					ereport(pgStatRunningInCollector ? LOG : WARNING,
-							(errmsg("corrupted pgstat.stat file")));
-					goto done;
-				}
-
-				/*
-				 * If possible, check PID to verify still running
-				 */
-				if (live_pids &&
-					(live_pids[0] == 0 ||
-					 bsearch((void *) &beentry->procpid,
-							 (void *) &live_pids[1],
-							 live_pids[0],
-							 sizeof(int),
-							 comparePids) == NULL))
-				{
-					/*
-					 * Note: we could send a BETERM message to tell the
-					 * collector to drop the entry, but I'm a bit worried
-					 * about race conditions.  For now, just silently ignore
-					 * dead entries; they'll get recycled eventually anyway.
-					 */
-
-					/* Don't accept the entry */
-					memset(beentry, 0, sizeof(PgStat_StatBeEntry));
-					break;
-				}
-
-				/*
-				 * Count backends per database here.
-				 */
-				dbentry = (PgStat_StatDBEntry *)
-					hash_search(*dbhash,
-								&(beentry->databaseid),
-								HASH_FIND,
-								NULL);
-				if (dbentry)
-					dbentry->n_backends++;
-
-				havebackends++;
-				*numbackends = havebackends;
+				}
 
+				memcpy(tabentry, &tabbuf, sizeof(tabbuf));
 				break;
 
 				/*
@@ -2771,253 +2767,71 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 
 done:
 	FreeFile(fpin);
+
+	return dbhash;
 }
 
 /*
- * If not done for this transaction, read the statistics collector
- * stats file into some hash tables.
- *
- * Because we store the hash tables in TopTransactionContext, the result
- * is good for the entire current main transaction.
- *
- * Inside the autovacuum process, the statfile is assumed to be valid
- * "forever", that is one iteration, within one database.  This means
- * we only consider the statistics as they were when the autovacuum
- * iteration started.
+ * If not already done, read the statistics collector stats file into
+ * some hash tables.  The results will be kept until pgstat_clear_snapshot()
+ * is called (typically, at end of transaction).
  */
 static void
 backend_read_statsfile(void)
 {
-	if (IsAutoVacuumProcess())
-	{
-		/* already read it? */
-		if (pgStatDBHash)
-			return;
-		Assert(!pgStatRunningInCollector);
-		pgstat_read_statsfile(&pgStatDBHash, InvalidOid,
-							  &pgStatBeTable, &pgStatNumBackends);
-	}
-	else
-	{
-		TransactionId topXid = GetTopTransactionId();
-
-		if (!TransactionIdEquals(pgStatDBHashXact, topXid))
-		{
-			Assert(!pgStatRunningInCollector);
-			pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-								  &pgStatBeTable, &pgStatNumBackends);
-			pgStatDBHashXact = topXid;
-		}
-	}
-}
-
-
-/* ----------
- * pgstat_recv_bestart() -
- *
- *	Process a backend startup message.
- * ----------
- */
-static void
-pgstat_recv_bestart(PgStat_MsgBestart *msg, int len)
-{
-	PgStat_StatBeEntry *entry;
-
-	/*
-	 * If the backend is known dead, we ignore the message -- we don't want to
-	 * update the backend entry's state since this BESTART message refers to
-	 * an old, dead backend
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) != 0)
+	/* already read it? */
+	if (pgStatDBHash)
 		return;
+	Assert(!pgStatRunningInCollector);
 
-	entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
-	entry->userid = msg->m_userid;
-	memcpy(&entry->clientaddr, &msg->m_clientaddr, sizeof(entry->clientaddr));
-	entry->databaseid = msg->m_databaseid;
+	/* Autovacuum launcher wants stats about all databases */
+	if (IsAutoVacuumLauncherProcess())
+		pgStatDBHash = pgstat_read_statsfile(InvalidOid);
+	else
+		pgStatDBHash = pgstat_read_statsfile(MyDatabaseId);
 }
 
 
 /* ----------
- * pgstat_recv_beterm() -
- *
- *	Process a backend termination message.
- * ----------
- */
-static void
-pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len)
-{
-	pgstat_sub_backend(msg->m_hdr.m_procpid);
-}
-
-/* ----------
- * pgstat_recv_autovac() -
+ * pgstat_setup_memcxt() -
  *
- *	Process an autovacuum signalling message.
+ *	Create pgStatLocalContext, if not already done.
  * ----------
  */
 static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+pgstat_setup_memcxt(void)
 {
-	PgStat_StatDBEntry *dbentry;
-
-	/*
-	 * Lookup the database in the hashtable.  Don't create the entry if it
-	 * doesn't exist, because autovacuum may be processing a template
-	 * database.  If this isn't the case, the database is most likely to have
-	 * an entry already.  (If it doesn't, not much harm is done anyway --
-	 * it'll get created as soon as somebody actually uses the database.)
-	 */
-	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-	if (dbentry == NULL)
-		return;
-
-	/*
-	 * Store the last autovacuum time in the database entry.
-	 */
-	dbentry->last_autovac_time = msg->m_start_time;
+	if (!pgStatLocalContext)
+		pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
+												   "Statistics snapshot",
+												   ALLOCSET_SMALL_MINSIZE,
+												   ALLOCSET_SMALL_INITSIZE,
+												   ALLOCSET_SMALL_MAXSIZE);
 }
 
-/* ----------
- * pgstat_recv_vacuum() -
- *
- *	Process a VACUUM message.
- * ----------
- */
-static void
-pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
-{
-	PgStat_StatDBEntry *dbentry;
-	PgStat_StatTabEntry *tabentry;
-	bool		found;
-	bool		create;
-
-	/*
-	 * If we don't know about the database, ignore the message, because it may
-	 * be autovacuum processing a template database.  But if the message is
-	 * for database InvalidOid, don't ignore it, because we are getting a
-	 * message from vacuuming a shared relation.
-	 */
-	create = (msg->m_databaseid == InvalidOid);
-
-	dbentry = pgstat_get_db_entry(msg->m_databaseid, create);
-	if (dbentry == NULL)
-		return;
-
-	tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-						   HASH_ENTER, &found);
-
-	/*
-	 * If we are creating the entry, initialize it.
-	 */
-	if (!found)
-	{
-		tabentry->numscans = 0;
-
-		tabentry->tuples_returned = 0;
-		tabentry->tuples_fetched = 0;
-		tabentry->tuples_inserted = 0;
-		tabentry->tuples_updated = 0;
-		tabentry->tuples_deleted = 0;
-
-		tabentry->n_live_tuples = msg->m_tuples;
-		tabentry->n_dead_tuples = 0;
-
-		if (msg->m_analyze)
-			tabentry->last_anl_tuples = msg->m_tuples;
-		else
-			tabentry->last_anl_tuples = 0;
-
-		tabentry->blocks_fetched = 0;
-		tabentry->blocks_hit = 0;
-
-		tabentry->destroy = 0;
-	}
-	else
-	{
-		tabentry->n_live_tuples = msg->m_tuples;
-		tabentry->n_dead_tuples = 0;
-		if (msg->m_analyze)
-			tabentry->last_anl_tuples = msg->m_tuples;
-	}
-}
 
 /* ----------
- * pgstat_recv_analyze() -
+ * pgstat_clear_snapshot() -
  *
- *	Process an ANALYZE message.
- * ----------
- */
-static void
-pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
-{
-	PgStat_StatDBEntry *dbentry;
-	PgStat_StatTabEntry *tabentry;
-	bool		found;
-
-	/*
-	 * Note that we do create the database entry here, as opposed to what we
-	 * do on AutovacStart and Vacuum messages.	This is because autovacuum
-	 * never executes ANALYZE on template databases.
-	 */
-	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-	tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-						   HASH_ENTER, &found);
-
-	/*
-	 * If we are creating the entry, initialize it.
-	 */
-	if (!found)
-	{
-		tabentry->numscans = 0;
-
-		tabentry->tuples_returned = 0;
-		tabentry->tuples_fetched = 0;
-		tabentry->tuples_inserted = 0;
-		tabentry->tuples_updated = 0;
-		tabentry->tuples_deleted = 0;
-
-		tabentry->n_live_tuples = msg->m_live_tuples;
-		tabentry->n_dead_tuples = msg->m_dead_tuples;
-		tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
-
-		tabentry->blocks_fetched = 0;
-		tabentry->blocks_hit = 0;
-
-		tabentry->destroy = 0;
-	}
-	else
-	{
-		tabentry->n_live_tuples = msg->m_live_tuples;
-		tabentry->n_dead_tuples = msg->m_dead_tuples;
-		tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
-	}
-}
-
-/* ----------
- * pgstat_recv_activity() -
+ *	Discard any data collected in the current transaction.  Any subsequent
+ *	request will cause new snapshots to be read.
  *
- *	Remember what the backend is doing.
+ *	This is also invoked during transaction commit or abort to discard
+ *	the no-longer-wanted snapshot.
  * ----------
  */
-static void
-pgstat_recv_activity(PgStat_MsgActivity *msg, int len)
+void
+pgstat_clear_snapshot(void)
 {
-	PgStat_StatBeEntry *entry;
-
-	/*
-	 * Here we check explicitly for 0 return, since we don't want to mangle
-	 * the activity of an active backend by a delayed packet from a dead one.
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) != 0)
-		return;
-
-	entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
-
-	StrNCpy(entry->activity, msg->m_cmd_str, PGSTAT_ACTIVITY_SIZE);
-
-	entry->activity_start_timestamp = GetCurrentTimestamp();
+	/* Release memory, if any was allocated */
+	if (pgStatLocalContext)
+		MemoryContextDelete(pgStatLocalContext);
+
+	/* Reset variables */
+	pgStatLocalContext = NULL;
+	pgStatDBHash = NULL;
+	localBackendStatusTable = NULL;
+	localNumBackends = 0;
 }
 
 
@@ -3036,21 +2850,11 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 	int			i;
 	bool		found;
 
-	/*
-	 * Make sure the backend is counted for.
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) < 0)
-		return;
-
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
 	/*
-	 * If the database is marked for destroy, this is a delayed UDP packet and
-	 * not worth being counted.
+	 * Update database-wide stats.
 	 */
-	if (dbentry->destroy > 0)
-		return;
-
 	dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
 	dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
 
@@ -3069,48 +2873,50 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 			 * If it's a new table entry, initialize counters to the values we
 			 * just got.
 			 */
-			tabentry->numscans = tabmsg[i].t_numscans;
-			tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-			tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-			tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-			tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-			tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
-			tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
-			tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
-				tabmsg[i].t_tuples_deleted;
-			tabentry->last_anl_tuples = 0;
-
-			tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-			tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
+			tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+			tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+			tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+			tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+			tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+			tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+			tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+			tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+			tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+			tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
 
-			tabentry->destroy = 0;
+			tabentry->last_anl_tuples = 0;
+			tabentry->vacuum_timestamp = 0;
+			tabentry->autovac_vacuum_timestamp = 0;
+			tabentry->analyze_timestamp = 0;
+			tabentry->autovac_analyze_timestamp = 0;
 		}
 		else
 		{
 			/*
 			 * Otherwise add the values to the existing entry.
 			 */
-			tabentry->numscans += tabmsg[i].t_numscans;
-			tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-			tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-			tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-			tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-			tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-			tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted;
-			tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
-				tabmsg[i].t_tuples_deleted;
-
-			tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-			tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+			tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+			tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+			tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+			tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+			tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+			tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+			tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+			tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+			tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+			tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
 		}
 
 		/*
-		 * And add the block IO to the database entry.
+		 * Add per-table stats to the per-database entry, too.
 		 */
-		dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-		dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+		dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+		dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+		dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+		dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+		dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+		dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+		dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
 	}
 }
 
@@ -3125,15 +2931,8 @@ static void
 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
 {
 	PgStat_StatDBEntry *dbentry;
-	PgStat_StatTabEntry *tabentry;
 	int			i;
 
-	/*
-	 * Make sure the backend is counted for.
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) < 0)
-		return;
-
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
 
 	/*
@@ -3142,23 +2941,15 @@ pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
 	if (!dbentry || !dbentry->tables)
 		return;
 
-	/*
-	 * If the database is marked for destroy, this is a delayed UDP packet and
-	 * the tables will go away at DB destruction.
-	 */
-	if (dbentry->destroy > 0)
-		return;
-
 	/*
 	 * Process all table entries in the message.
 	 */
 	for (i = 0; i < msg->m_nentries; i++)
 	{
-		tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-											   (void *) &(msg->m_tableid[i]),
-													   HASH_FIND, NULL);
-		if (tabentry)
-			tabentry->destroy = PGSTAT_DESTROY_COUNT;
+		/* Remove from hashtable if present; we don't care if it's not. */
+		(void) hash_search(dbentry->tables,
+						   (void *) &(msg->m_tableid[i]),
+						   HASH_REMOVE, NULL);
 	}
 }
 
@@ -3174,22 +2965,26 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
 {
 	PgStat_StatDBEntry *dbentry;
 
-	/*
-	 * Make sure the backend is counted for.
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) < 0)
-		return;
-
 	/*
 	 * Lookup the database in the hashtable.
 	 */
 	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
 
 	/*
-	 * Mark the database for destruction.
+	 * If found, remove it.
 	 */
 	if (dbentry)
-		dbentry->destroy = PGSTAT_DESTROY_COUNT;
+	{
+		if (dbentry->tables != NULL)
+			hash_destroy(dbentry->tables);
+
+		if (hash_search(pgStatDBHash,
+						(void *) &(dbentry->databaseid),
+						HASH_REMOVE, NULL) == NULL)
+			ereport(ERROR,
+					(errmsg("database hash table corrupted "
+							"during cleanup --- abort")));
+	}
 }
 
 
@@ -3205,12 +3000,6 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 	HASHCTL		hash_ctl;
 	PgStat_StatDBEntry *dbentry;
 
-	/*
-	 * Make sure the backend is counted for.
-	 */
-	if (pgstat_add_backend(&msg->m_hdr) < 0)
-		return;
-
 	/*
 	 * Lookup the database in the hashtable.  Nothing to do if not there.
 	 */
@@ -3231,7 +3020,6 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 	dbentry->n_xact_rollback = 0;
 	dbentry->n_blocks_fetched = 0;
 	dbentry->n_blocks_hit = 0;
-	dbentry->destroy = 0;
 
 	memset(&hash_ctl, 0, sizeof(hash_ctl));
 	hash_ctl.keysize = sizeof(Oid);
@@ -3242,3 +3030,133 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
 								  &hash_ctl,
 								  HASH_ELEM | HASH_FUNCTION);
 }
+
+/* ----------
+ * pgstat_recv_autovac() -
+ *
+ *	Process an autovacuum signalling message.
+ * ----------
+ */
+static void
+pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+
+	/*
+	 * Lookup the database in the hashtable.  Don't create the entry if it
+	 * doesn't exist, because autovacuum may be processing a template
+	 * database.  If this isn't the case, the database is most likely to have
+	 * an entry already.  (If it doesn't, not much harm is done anyway --
+	 * it'll get created as soon as somebody actually uses the database.)
+	 */
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+	if (dbentry == NULL)
+		return;
+
+	/*
+	 * Store the last autovacuum time in the database entry.
+	 */
+	dbentry->last_autovac_time = msg->m_start_time;
+}
+
+/* ----------
+ * pgstat_recv_vacuum() -
+ *
+ *	Process a VACUUM message.
+ * ----------
+ */
+static void
+pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+	PgStat_StatTabEntry *tabentry;
+
+	/*
+	 * Don't create either the database or table entry if it doesn't already
+	 * exist.  This avoids bloating the stats with entries for stuff that is
+	 * only touched by vacuum and not by live operations.
+	 */
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+	if (dbentry == NULL)
+		return;
+
+	tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+						   HASH_FIND, NULL);
+	if (tabentry == NULL)
+		return;
+
+	if (msg->m_autovacuum)
+		tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
+	else
+		tabentry->vacuum_timestamp = msg->m_vacuumtime;
+	tabentry->n_live_tuples = msg->m_tuples;
+	tabentry->n_dead_tuples = 0;
+	if (msg->m_analyze)
+	{
+		tabentry->last_anl_tuples = msg->m_tuples;
+		if (msg->m_autovacuum)
+			tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
+		else
+			tabentry->analyze_timestamp = msg->m_vacuumtime;
+	}
+	else
+	{
+		/* last_anl_tuples must never exceed n_live_tuples */
+		tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples,
+										msg->m_tuples);
+	}
+}
+
+/* ----------
+ * pgstat_recv_analyze() -
+ *
+ *	Process an ANALYZE message.
+ * ----------
+ */
+static void
+pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
+{
+	PgStat_StatDBEntry *dbentry;
+	PgStat_StatTabEntry *tabentry;
+
+	/*
+	 * Don't create either the database or table entry if it doesn't already
+	 * exist.  This avoids bloating the stats with entries for stuff that is
+	 * only touched by analyze and not by live operations.
+	 */
+	dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+	if (dbentry == NULL)
+		return;
+
+	tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
+						   HASH_FIND, NULL);
+	if (tabentry == NULL)
+		return;
+
+	if (msg->m_autovacuum)
+		tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
+	else
+		tabentry->analyze_timestamp = msg->m_analyzetime;
+	tabentry->n_live_tuples = msg->m_live_tuples;
+	tabentry->n_dead_tuples = msg->m_dead_tuples;
+	tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
+}
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ *	Process a BGWRITER message.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+	globalStats.timed_checkpoints += msg->m_timed_checkpoints;
+	globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+	globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+	globalStats.buf_written_lru += msg->m_buf_written_lru;
+	globalStats.buf_written_all += msg->m_buf_written_all;
+	globalStats.maxwritten_lru += msg->m_maxwritten_lru;
+	globalStats.maxwritten_all += msg->m_maxwritten_all;
+}