granicus.if.org Git - postgresql/blobdiff - src/backend/postmaster/pgstat.c
Fix VACUUM so that it always updates pg_class.reltuples/relpages.
[postgresql] / src / backend / postmaster / pgstat.c
index a340a91fed288fd82aa906d50805f223d353ca7c..1d80c311d879d9cf9009621860cda3ab19c6dea9 100644 (file)
@@ -11,9 +11,9 @@
  *                     - Add a pgstat config column to pg_database, so this
  *                       entire thing can be enabled/disabled on a per db basis.
  *
- *     Copyright (c) 2001-2006, PostgreSQL Global Development Group
+ *     Copyright (c) 2001-2011, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.130 2006/06/20 22:52:00 tgl Exp $
+ *     src/backend/postmaster/pgstat.c
  * ----------
  */
 #include "postgres.h"
 #include <arpa/inet.h>
 #include <signal.h>
 #include <time.h>
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#ifdef HAVE_SYS_POLL_H
+#include <sys/poll.h>
+#endif
 
 #include "pgstat.h"
 
 #include "access/heapam.h"
+#include "access/transam.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/pg_database.h"
+#include "catalog/pg_proc.h"
+#include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "libpq/pqsignal.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
+#include "pg_trace.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/fork_process.h"
 #include "postmaster/postmaster.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
-#include "storage/procarray.h"
-#include "tcop/tcopprot.h"
-#include "utils/hsearch.h"
+#include "storage/procsignal.h"
+#include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/rel.h"
-#include "utils/syscache.h"
+#include "utils/tqual.h"
 
 
 /* ----------
  * Paths for the statistics files (relative to installation's $PGDATA).
  * ----------
  */
-#define PGSTAT_STAT_FILENAME   "global/pgstat.stat"
-#define PGSTAT_STAT_TMPFILE            "global/pgstat.tmp"
+#define PGSTAT_STAT_PERMANENT_FILENAME         "global/pgstat.stat"
+#define PGSTAT_STAT_PERMANENT_TMPFILE          "global/pgstat.tmp"
 
 /* ----------
  * Timer definitions.
  * ----------
  */
-#define PGSTAT_STAT_INTERVAL   500             /* How often to write the status file;
-                                                                                * in milliseconds. */
+#define PGSTAT_STAT_INTERVAL   500             /* Minimum time between stats file
+                                                                                * updates; in milliseconds. */
+
+#define PGSTAT_RETRY_DELAY             10              /* How long to wait between statistics
+                                                                                * update requests; in milliseconds. */
+
+#define PGSTAT_MAX_WAIT_TIME   5000    /* Maximum time to wait for a stats
+                                                                                * file update; in milliseconds. */
 
 #define PGSTAT_RESTART_INTERVAL 60             /* How often to attempt to restart a
                                                                                 * failed statistics collector; in
                                                                                 * seconds. */
 
-/* ----------
- * Amount of space reserved in pgstat_recvbuffer().
- * ----------
- */
-#define PGSTAT_RECVBUFFERSZ            ((int) (1024 * sizeof(PgStat_Msg)))
+#define PGSTAT_SELECT_TIMEOUT  2               /* How often to check for postmaster
+                                                                                * death; in seconds. */
+
+#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
+
 
 /* ----------
  * The initial size hints for the hash tables used in the collector.
  */
 #define PGSTAT_DB_HASH_SIZE            16
 #define PGSTAT_TAB_HASH_SIZE   512
+#define PGSTAT_FUNCTION_HASH_SIZE      512
 
 
 /* ----------
  * GUC parameters
  * ----------
  */
-bool           pgstat_collect_startcollector = true;
-bool           pgstat_collect_resetonpmstart = false;
-bool           pgstat_collect_tuplelevel = false;
-bool           pgstat_collect_blocklevel = false;
-bool           pgstat_collect_querystring = false;
+bool           pgstat_track_activities = false;
+bool           pgstat_track_counts = false;
+int                    pgstat_track_functions = TRACK_FUNC_OFF;
+int                    pgstat_track_activity_query_size = 1024;
+
+/* ----------
+ * Built from GUC parameter
+ * ----------
+ */
+char      *pgstat_stat_filename = NULL;
+char      *pgstat_stat_tmpname = NULL;
+
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
 
 /* ----------
  * Local data
  * ----------
  */
-NON_EXEC_STATIC int pgStatSock = -1;
-NON_EXEC_STATIC int pgStatPipe[2] = {-1, -1};
+NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET;
 
 static struct sockaddr_storage pgStatAddr;
 
-static pid_t pgStatCollectorPid = 0;
-
 static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
 /*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
  */
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM                100 /* we alloc this many at a time */
+
+typedef struct TabStatusArray
 {
-       int                     tsa_alloc;              /* num allocated */
-       int                     tsa_used;               /* num actually used */
-       PgStat_MsgTabstat **tsa_messages;       /* the array itself */
-} TabStatArray;
+       struct TabStatusArray *tsa_next;        /* link to next array, if any */
+       int                     tsa_used;               /* # entries currently used */
+       PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
+} TabStatusArray;
+
+static TabStatusArray *pgStatTabList = NULL;
+
+/*
+ * Backends store per-function info that's waiting to be sent to the collector
+ * in this hash table (indexed by function OID).
+ */
+static HTAB *pgStatFunctions = NULL;
+
+/*
+ * Indicates if backend has some function stats that it hasn't yet
+ * sent to the collector.
+ */
+static bool have_function_stats = false;
 
-#define TABSTAT_QUANTUM                4       /* we alloc this many at a time */
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+       int                     nest_level;             /* subtransaction nest level */
+       struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
+       PgStat_TableXactStatus *first;          /* head of list for this subxact */
+} PgStat_SubXactStatus;
 
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
 
 static int     pgStatXactCommit = 0;
 static int     pgStatXactRollback = 0;
 
-static TransactionId pgStatDBHashXact = InvalidTransactionId;
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+       PgStat_Counter tuples_inserted;         /* tuples inserted in xact */
+       PgStat_Counter tuples_updated;          /* tuples updated in xact */
+       PgStat_Counter tuples_deleted;          /* tuples deleted in xact */
+       Oid                     t_id;                   /* table's OID */
+       bool            t_shared;               /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
+static MemoryContext pgStatLocalContext = NULL;
 static HTAB *pgStatDBHash = NULL;
-static TransactionId pgStatLocalStatusXact = InvalidTransactionId;
 static PgBackendStatus *localBackendStatusTable = NULL;
 static int     localNumBackends = 0;
 
-static volatile bool   need_statwrite;
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
+
+/* Last time the collector successfully wrote the stats file */
+static TimestampTz last_statwrite;
+
+/* Latest statistics request time from backends */
+static TimestampTz last_statrequest;
+
+static volatile bool need_exit = false;
+static volatile bool got_SIGHUP = false;
+
+/*
+ * Total time charged to functions so far in the current backend.
+ * We use this to help separate "self" and "other" time charges.
+ * (We assume this initializes to zero.)
+ */
+static instr_time total_func_time;
 
 
 /* ----------
@@ -146,42 +239,47 @@ static volatile bool      need_statwrite;
  * ----------
  */
 #ifdef EXEC_BACKEND
-
-typedef enum STATS_PROCESS_TYPE
-{
-       STAT_PROC_BUFFER,
-       STAT_PROC_COLLECTOR
-}      STATS_PROCESS_TYPE;
-
-static pid_t pgstat_forkexec(STATS_PROCESS_TYPE procType);
-static void pgstat_parseArgs(int argc, char *argv[]);
+static pid_t pgstat_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgstatBufferMain(int argc, char *argv[]);
 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
-static void force_statwrite(SIGNAL_ARGS);
-static void pgstat_recvbuffer(void);
 static void pgstat_exit(SIGNAL_ARGS);
-static void pgstat_die(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
+static void pgstat_sighup_handler(SIGNAL_ARGS);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static void pgstat_drop_database(Oid databaseid);
-static void pgstat_write_statsfile(void);
-static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
+                                        Oid tableoid, bool create);
+static void pgstat_write_statsfile(bool permanent);
+static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
 
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
+static void pgstat_send_funcstats(void);
+static HTAB *pgstat_collect_oids(Oid catalogid);
+
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
+static void pgstat_setup_memcxt(void);
+
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
 static void pgstat_send(void *msg, int len);
 
+static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
 static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
+static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len);
+static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len);
 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
+static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
+static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
+static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
 
 
 /* ------------------------------------------------------------
@@ -211,30 +309,8 @@ pgstat_init(void)
        char            test_byte;
        int                     sel_res;
        int                     tries = 0;
-       
-#define TESTBYTEVAL ((char) 199)
 
-       /*
-        * Force start of collector daemon if something to collect.  Note that
-        * pgstat_collect_querystring is now an independent facility that does
-        * not require the collector daemon.
-        */
-       if (pgstat_collect_tuplelevel ||
-               pgstat_collect_blocklevel)
-               pgstat_collect_startcollector = true;
-
-       /*
-        * If we don't have to start a collector or should reset the collected
-        * statistics on postmaster start, simply remove the stats file.
-        */
-       if (!pgstat_collect_startcollector || pgstat_collect_resetonpmstart)
-               pgstat_reset_all();
-
-       /*
-        * Nothing else required if collector will not get started
-        */
-       if (!pgstat_collect_startcollector)
-               return;
+#define TESTBYTEVAL ((char) 199)
 
        /*
         * Create the UDP socket for sending and receiving statistic messages
@@ -274,12 +350,12 @@ pgstat_init(void)
 
                if (++tries > 1)
                        ereport(LOG,
-                               (errmsg("trying another address for the statistics collector")));
-               
+                       (errmsg("trying another address for the statistics collector")));
+
                /*
                 * Create the socket.
                 */
-               if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) < 0)
+               if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
                {
                        ereport(LOG,
                                        (errcode_for_socket_access(),
@@ -297,7 +373,7 @@ pgstat_init(void)
                                        (errcode_for_socket_access(),
                          errmsg("could not bind socket for statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -308,7 +384,7 @@ pgstat_init(void)
                                        (errcode_for_socket_access(),
                                         errmsg("could not get address of socket for statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -324,7 +400,7 @@ pgstat_init(void)
                                        (errcode_for_socket_access(),
                        errmsg("could not connect socket for statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -335,13 +411,17 @@ pgstat_init(void)
                 * rules prevent it).
                 */
                test_byte = TESTBYTEVAL;
+
+retry1:
                if (send(pgStatSock, &test_byte, 1, 0) != 1)
                {
+                       if (errno == EINTR)
+                               goto retry1;    /* if interrupted, just retry */
                        ereport(LOG,
                                        (errcode_for_socket_access(),
                                         errmsg("could not send test message on socket for statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -354,6 +434,7 @@ pgstat_init(void)
                {
                        FD_ZERO(&rset);
                        FD_SET(pgStatSock, &rset);
+
                        tv.tv_sec = 0;
                        tv.tv_usec = 500000;
                        sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
@@ -366,7 +447,7 @@ pgstat_init(void)
                                        (errcode_for_socket_access(),
                                         errmsg("select() failed in statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
                if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
@@ -381,19 +462,22 @@ pgstat_init(void)
                                        (errcode(ERRCODE_CONNECTION_FAILURE),
                                         errmsg("test message did not get through on socket for statistics collector")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
                test_byte++;                    /* just make sure variable is changed */
 
+retry2:
                if (recv(pgStatSock, &test_byte, 1, 0) != 1)
                {
+                       if (errno == EINTR)
+                               goto retry2;    /* if interrupted, just retry */
                        ereport(LOG,
                                        (errcode_for_socket_access(),
                                         errmsg("could not receive test message on socket for statistics collector: %m")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -403,7 +487,7 @@ pgstat_init(void)
                                        (errcode(ERRCODE_INTERNAL_ERROR),
                                         errmsg("incorrect test message transmission on socket for statistics collector")));
                        closesocket(pgStatSock);
-                       pgStatSock = -1;
+                       pgStatSock = PGINVALID_SOCKET;
                        continue;
                }
 
@@ -412,14 +496,13 @@ pgstat_init(void)
        }
 
        /* Did we find a working address? */
-       if (!addr || pgStatSock < 0)
+       if (!addr || pgStatSock == PGINVALID_SOCKET)
                goto startup_failed;
 
        /*
         * Set the socket to non-blocking IO.  This ensures that if the collector
-        * falls behind (despite the buffering process), statistics messages will
-        * be discarded; backends won't block waiting to send messages to the
-        * collector.
+        * falls behind, statistics messages will be discarded; backends won't
+        * block waiting to send messages to the collector.
         */
        if (!pg_set_noblock(pgStatSock))
        {
@@ -440,27 +523,30 @@ startup_failed:
        if (addrs)
                pg_freeaddrinfo_all(hints.ai_family, addrs);
 
-       if (pgStatSock >= 0)
+       if (pgStatSock != PGINVALID_SOCKET)
                closesocket(pgStatSock);
-       pgStatSock = -1;
+       pgStatSock = PGINVALID_SOCKET;
 
-       /* Adjust GUC variables to suppress useless activity */
-       pgstat_collect_startcollector = false;
-       pgstat_collect_tuplelevel = false;
-       pgstat_collect_blocklevel = false;
+       /*
+        * Adjust GUC variables to suppress useless activity, and for debugging
+        * purposes (seeing track_counts off is a clue that we failed here). We
+        * use PGC_S_OVERRIDE because there is no point in trying to turn it back
+        * on from postgresql.conf without a restart.
+        */
+       SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
 /*
  * pgstat_reset_all() -
  *
- * Remove the stats file.  This is used on server start if the
- * stats_reset_on_server_start feature is enabled, or if WAL
+ * Remove the stats file.  This is currently used only if WAL
  * recovery is needed after a crash.
  */
 void
 pgstat_reset_all(void)
 {
-       unlink(PGSTAT_STAT_FILENAME);
+       unlink(pgstat_stat_filename);   /* stats file at the path built from the GUC parameter */
+       unlink(PGSTAT_STAT_PERMANENT_FILENAME); /* permanent copy, "global/pgstat.stat" */
 }
 
 #ifdef EXEC_BACKEND
@@ -468,69 +554,27 @@ pgstat_reset_all(void)
 /*
  * pgstat_forkexec() -
  *
- * Format up the arglist for, then fork and exec, statistics
- * (buffer and collector) processes
+ * Format up the arglist for, then fork and exec, statistics collector process
  */
 static pid_t
-pgstat_forkexec(STATS_PROCESS_TYPE procType)
+pgstat_forkexec(void)
 {
        char       *av[10];
-       int                     ac = 0,
-                               bufc = 0,
-                               i;
-       char            pgstatBuf[2][32];
+       int                     ac = 0;
 
        av[ac++] = "postgres";
-
-       switch (procType)
-       {
-               case STAT_PROC_BUFFER:
-                       av[ac++] = "--forkbuf";
-                       break;
-
-               case STAT_PROC_COLLECTOR:
-                       av[ac++] = "--forkcol";
-                       break;
-
-               default:
-                       Assert(false);
-       }
-
+       av[ac++] = "--forkcol";
        av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
 
-       /* postgres_exec_path is not passed by write_backend_variables */
-       av[ac++] = postgres_exec_path;
-
-       /* Add to the arg list */
-       Assert(bufc <= lengthof(pgstatBuf));
-       for (i = 0; i < bufc; i++)
-               av[ac++] = pgstatBuf[i];
-
        av[ac] = NULL;
        Assert(ac < lengthof(av));
 
        return postmaster_forkexec(ac, av);
 }
-
-
-/*
- * pgstat_parseArgs() -
- *
- * Extract data from the arglist for exec'ed statistics
- * (buffer and collector) processes
- */
-static void
-pgstat_parseArgs(int argc, char *argv[])
-{
-       Assert(argc == 4);
-
-       argc = 3;
-       StrNCpy(postgres_exec_path, argv[argc++], MAXPGPATH);
-}
 #endif   /* EXEC_BACKEND */
 
 
-/* ----------
+/*
  * pgstat_start() -
  *
  *     Called from postmaster at startup or after an existing collector
@@ -539,7 +583,6 @@ pgstat_parseArgs(int argc, char *argv[])
  *     Returns PID of child process, or 0 if fail.
  *
  *     Note: if fail, we will be called again from the postmaster main loop.
- * ----------
  */
 int
 pgstat_start(void)
@@ -548,9 +591,10 @@ pgstat_start(void)
        pid_t           pgStatPid;
 
        /*
-        * Do nothing if no collector needed
+        * Check that the socket is there, else pgstat_init failed and we can do
+        * nothing useful.
         */
-       if (!pgstat_collect_startcollector)
+       if (pgStatSock == PGINVALID_SOCKET)
                return 0;
 
        /*
@@ -565,34 +609,18 @@ pgstat_start(void)
                return 0;
        last_pgstat_start_time = curtime;
 
-       /*
-        * Check that the socket is there, else pgstat_init failed.
-        */
-       if (pgStatSock < 0)
-       {
-               ereport(LOG,
-                               (errmsg("statistics collector startup skipped")));
-
-               /*
-                * We can only get here if someone tries to manually turn
-                * pgstat_collect_startcollector on after it had been off.
-                */
-               pgstat_collect_startcollector = false;
-               return 0;
-       }
-
        /*
         * Okay, fork off the collector.
         */
 #ifdef EXEC_BACKEND
-       switch ((pgStatPid = pgstat_forkexec(STAT_PROC_BUFFER)))
+       switch ((pgStatPid = pgstat_forkexec()))
 #else
        switch ((pgStatPid = fork_process()))
 #endif
        {
                case -1:
                        ereport(LOG,
-                                       (errmsg("could not fork statistics buffer: %m")));
+                                       (errmsg("could not fork statistics collector: %m")));
                        return 0;
 
 #ifndef EXEC_BACKEND
@@ -607,7 +635,7 @@ pgstat_start(void)
                        /* Drop our connection to postmaster's shared memory, as well */
                        PGSharedMemoryDetach();
 
-                       PgstatBufferMain(0, NULL);
+                       PgstatCollectorMain(0, NULL);
                        break;
 #endif
 
@@ -619,6 +647,11 @@ pgstat_start(void)
        return 0;
 }
 
+void
+allow_immediate_pgstat_restart(void)
+{
+       last_pgstat_start_time = 0;     /* forget the start time that pgstat_start() recorded */
+}
 
 /* ------------------------------------------------------------
  * Public functions used by backends follow
@@ -627,95 +660,219 @@ pgstat_start(void)
 
 
 /* ----------
- * pgstat_report_tabstat() -
+ * pgstat_report_stat() -
  *
- *     Called from tcop/postgres.c to send the so far collected
- *     per table access statistics to the collector.
+ *     Called from tcop/postgres.c to send the so far collected per-table
+ *     and function usage statistics to the collector.  Note that this is
+ *     called only when not within a transaction, so it is fair to use
+ *     transaction stop time as an approximation of current time.
  * ----------
  */
 void
-pgstat_report_tabstat(void)
+pgstat_report_stat(bool force)
 {
+       /* we assume this inits to all zeroes: */
+       static const PgStat_TableCounts all_zeroes;
+       static TimestampTz last_report = 0;
+
+       TimestampTz now;
+       PgStat_MsgTabstat regular_msg;
+       PgStat_MsgTabstat shared_msg;
+       TabStatusArray *tsa;
        int                     i;
 
-       if (pgStatSock < 0 ||
-               (!pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel))
-       {
-               /* Not reporting stats, so just flush whatever we have */
-               RegularTabStat.tsa_used = 0;
-               SharedTabStat.tsa_used = 0;
+       /* Don't expend a clock check if nothing to do */
+       if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0)
+               && !have_function_stats)
                return;
-       }
 
        /*
-        * For each message buffer used during the last query set the header
-        * fields and send it out.
+        * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+        * msec since we last sent one, or the caller wants to force stats out.
+        */
+       now = GetCurrentTransactionStopTimestamp();
+       if (!force &&
+               !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+               return;
+       last_report = now;
+
+       /*
+        * Scan through the TabStatusArray struct(s) to find tables that actually
+        * have counts, and build messages to send.  We have to separate shared
+        * relations from regular ones because the databaseid field in the message
+        * header has to depend on that.
         */
-       for (i = 0; i < RegularTabStat.tsa_used; i++)
+       regular_msg.m_databaseid = MyDatabaseId;
+       shared_msg.m_databaseid = InvalidOid;
+       regular_msg.m_nentries = 0;
+       shared_msg.m_nentries = 0;
+
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
        {
-               PgStat_MsgTabstat *tsmsg = RegularTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+                       PgStat_MsgTabstat *this_msg;
+                       PgStat_TableEntry *this_ent;
+
+                       /* Shouldn't have any pending transaction-dependent counts */
+                       Assert(entry->trans == NULL);
+
+                       /*
+                        * Ignore entries that didn't accumulate any actual counts, such
+                        * as indexes that were opened by the planner but not used.
+                        */
+                       if (memcmp(&entry->t_counts, &all_zeroes,
+                                          sizeof(PgStat_TableCounts)) == 0)
+                               continue;
+
+                       /*
+                        * OK, insert data into the appropriate message, and send if full.
+                        */
+                       this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+                       this_ent = &this_msg->m_entry[this_msg->m_nentries];
+                       this_ent->t_id = entry->t_id;
+                       memcpy(&this_ent->t_counts, &entry->t_counts,
+                                  sizeof(PgStat_TableCounts));
+                       if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+                       {
+                               pgstat_send_tabstat(this_msg);
+                               this_msg->m_nentries = 0;
+                       }
+               }
+               /* zero out TableStatus structs after use */
+               MemSet(tsa->tsa_entries, 0,
+                          tsa->tsa_used * sizeof(PgStat_TableStatus));
+               tsa->tsa_used = 0;
+       }
+
+       /*
+        * Send partial messages.  If force is true, make sure that any pending
+        * xact commit/abort gets counted, even if no table stats to send.
+        */
+       if (regular_msg.m_nentries > 0 ||
+               (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+               pgstat_send_tabstat(&regular_msg);
+       if (shared_msg.m_nentries > 0)
+               pgstat_send_tabstat(&shared_msg);
+
+       /* Now, send function statistics */
+       pgstat_send_funcstats();
+}
 
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
+/*
+ * Subroutine for pgstat_report_stat: finish and send a tabstat message
+ */
+static void
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+{
+       int                     n;
+       int                     len;
+
+       /* It's unlikely we'd get here with no socket, but maybe not impossible */
+       if (pgStatSock == PGINVALID_SOCKET)
+               return;
 
+       /*
+        * Report accumulated xact commit/rollback whenever we send a normal
+        * tabstat message
+        */
+       if (OidIsValid(tsmsg->m_databaseid))
+       {
                tsmsg->m_xact_commit = pgStatXactCommit;
                tsmsg->m_xact_rollback = pgStatXactRollback;
                pgStatXactCommit = 0;
                pgStatXactRollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = MyDatabaseId;
-               pgstat_send(tsmsg, len);
        }
-       RegularTabStat.tsa_used = 0;
+       else
+       {
+               tsmsg->m_xact_commit = 0;
+               tsmsg->m_xact_rollback = 0;
+       }
 
-       /* Ditto, for shared relations */
-       for (i = 0; i < SharedTabStat.tsa_used; i++)
+       n = tsmsg->m_nentries;
+       len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+               n * sizeof(PgStat_TableEntry);
+
+       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+       pgstat_send(tsmsg, len);
+}
+
+/*
+ * Subroutine for pgstat_report_stat: populate and send a function stat message
+ */
+static void
+pgstat_send_funcstats(void)
+{
+       /* we assume this inits to all zeroes: */
+       static const PgStat_FunctionCounts all_zeroes;
+
+       PgStat_MsgFuncstat msg;
+       PgStat_BackendFunctionEntry *entry;
+       HASH_SEQ_STATUS fstat;
+
+       if (pgStatFunctions == NULL)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
+       msg.m_databaseid = MyDatabaseId;
+       msg.m_nentries = 0;
+
+       hash_seq_init(&fstat, pgStatFunctions);
+       while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
        {
-               PgStat_MsgTabstat *tsmsg = SharedTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
+               PgStat_FunctionEntry *m_ent;
 
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
+               /* Skip it if no counts accumulated since last time */
+               if (memcmp(&entry->f_counts, &all_zeroes,
+                                  sizeof(PgStat_FunctionCounts)) == 0)
+                       continue;
 
-               /* We don't report transaction commit/abort here */
-               tsmsg->m_xact_commit = 0;
-               tsmsg->m_xact_rollback = 0;
+               /* need to convert format of time accumulators */
+               m_ent = &msg.m_entry[msg.m_nentries];
+               m_ent->f_id = entry->f_id;
+               m_ent->f_numcalls = entry->f_counts.f_numcalls;
+               m_ent->f_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time);
+               m_ent->f_time_self = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time_self);
+
+               if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
+               {
+                       pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
+                                               msg.m_nentries * sizeof(PgStat_FunctionEntry));
+                       msg.m_nentries = 0;
+               }
 
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = InvalidOid;
-               pgstat_send(tsmsg, len);
+               /* reset the entry's counts */
+               MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
        }
-       SharedTabStat.tsa_used = 0;
+
+       if (msg.m_nentries > 0)
+               pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
+                                       msg.m_nentries * sizeof(PgStat_FunctionEntry));
+
+       have_function_stats = false;
 }
 
 
 /* ----------
- * pgstat_vacuum_tabstat() -
+ * pgstat_vacuum_stat() -
  *
  *     Will tell the collector about objects he can get rid of.
  * ----------
  */
 void
-pgstat_vacuum_tabstat(void)
+pgstat_vacuum_stat(void)
 {
-       List       *oidlist;
-       Relation        rel;
-       HeapScanDesc scan;
-       HeapTuple       tup;
+       HTAB       *htab;
        PgStat_MsgTabpurge msg;
+       PgStat_MsgFuncpurge f_msg;
        HASH_SEQ_STATUS hstat;
        PgStat_StatDBEntry *dbentry;
        PgStat_StatTabEntry *tabentry;
+       PgStat_StatFuncEntry *funcentry;
        int                     len;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)
                return;
 
        /*
@@ -727,15 +884,7 @@ pgstat_vacuum_tabstat(void)
        /*
         * Read pg_database and make a list of OIDs of all existing databases
         */
-       oidlist = NIL;
-       rel = heap_open(DatabaseRelationId, AccessShareLock);
-       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
-       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
-       {
-               oidlist = lappend_oid(oidlist, HeapTupleGetOid(tup));
-       }
-       heap_endscan(scan);
-       heap_close(rel, AccessShareLock);
+       htab = pgstat_collect_oids(DatabaseRelationId);
 
        /*
         * Search the database hash table for dead databases and tell the
@@ -746,12 +895,16 @@ pgstat_vacuum_tabstat(void)
        {
                Oid                     dbid = dbentry->databaseid;
 
-               if (!list_member_oid(oidlist, dbid))
+               CHECK_FOR_INTERRUPTS();
+
+               /* the DB entry for shared tables (with InvalidOid) is never dropped */
+               if (OidIsValid(dbid) &&
+                       hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
                        pgstat_drop_database(dbid);
        }
 
        /* Clean up */
-       list_free(oidlist);
+       hash_destroy(htab);
 
        /*
         * Lookup our own database entry; if not found, nothing more to do.
@@ -765,15 +918,7 @@ pgstat_vacuum_tabstat(void)
        /*
         * Similarly to above, make a list of all known relations in this DB.
         */
-       oidlist = NIL;
-       rel = heap_open(RelationRelationId, AccessShareLock);
-       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
-       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
-       {
-               oidlist = lappend_oid(oidlist, HeapTupleGetOid(tup));
-       }
-       heap_endscan(scan);
-       heap_close(rel, AccessShareLock);
+       htab = pgstat_collect_oids(RelationRelationId);
 
        /*
         * Initialize our messages table counter to zero
@@ -786,13 +931,17 @@ pgstat_vacuum_tabstat(void)
        hash_seq_init(&hstat, dbentry->tables);
        while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
        {
-               if (list_member_oid(oidlist, tabentry->tableid))
+               Oid                     tabid = tabentry->tableid;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
                        continue;
 
                /*
                 * Not there, so add this table's Oid to the message
                 */
-               msg.m_tableid[msg.m_nentries++] = tabentry->tableid;
+               msg.m_tableid[msg.m_nentries++] = tabid;
 
                /*
                 * If the message is full, send it out and reinitialize to empty
@@ -824,7 +973,108 @@ pgstat_vacuum_tabstat(void)
        }
 
        /* Clean up */
-       list_free(oidlist);
+       hash_destroy(htab);
+
+       /*
+        * Now repeat the above steps for functions.  However, we needn't bother
+        * in the common case where no function stats are being collected.
+        */
+       if (dbentry->functions != NULL &&
+               hash_get_num_entries(dbentry->functions) > 0)
+       {
+               htab = pgstat_collect_oids(ProcedureRelationId);
+
+               pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
+               f_msg.m_databaseid = MyDatabaseId;
+               f_msg.m_nentries = 0;
+
+               hash_seq_init(&hstat, dbentry->functions);
+               while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
+               {
+                       Oid                     funcid = funcentry->functionid;
+
+                       CHECK_FOR_INTERRUPTS();
+
+                       if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
+                               continue;
+
+                       /*
+                        * Not there, so add this function's Oid to the message
+                        */
+                       f_msg.m_functionid[f_msg.m_nentries++] = funcid;
+
+                       /*
+                        * If the message is full, send it out and reinitialize to empty
+                        */
+                       if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
+                       {
+                               len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
+                                       +f_msg.m_nentries * sizeof(Oid);
+
+                               pgstat_send(&f_msg, len);
+
+                               f_msg.m_nentries = 0;
+                       }
+               }
+
+               /*
+                * Send the rest
+                */
+               if (f_msg.m_nentries > 0)
+               {
+                       len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
+                               +f_msg.m_nentries * sizeof(Oid);
+
+                       pgstat_send(&f_msg, len);
+               }
+
+               hash_destroy(htab);
+       }
+}
+
+
+/* ----------
+ * pgstat_collect_oids() -
+ *
+ *     Collect the OIDs of all objects listed in the specified system catalog
+ *     into a temporary hash table.  Caller should hash_destroy the result
+ *     when done with it.      (However, we make the table in CurrentMemoryContext
+ *     so that it will be freed properly in event of an error.)
+ *
+ *     catalogid is the OID of the catalog to scan; it is handed to heap_open()
+ *     and every tuple returned by the scan contributes its OID.
+ *
+ *     The returned table uses sizeof(Oid) for both key and entry size, so it
+ *     acts as a plain set of OIDs (pgstat_vacuum_stat probes it with HASH_FIND).
+ * ----------
+ */
+static HTAB *
+pgstat_collect_oids(Oid catalogid)
+{
+       HTAB       *htab;
+       HASHCTL         hash_ctl;
+       Relation        rel;
+       HeapScanDesc scan;
+       HeapTuple       tup;
+
+       memset(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(Oid);
+       hash_ctl.entrysize = sizeof(Oid);       /* entry holds nothing but the key */
+       hash_ctl.hash = oid_hash;
+       hash_ctl.hcxt = CurrentMemoryContext;
+       htab = hash_create("Temporary table of OIDs",
+                                          PGSTAT_TAB_HASH_SIZE,
+                                          &hash_ctl,
+                                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+       rel = heap_open(catalogid, AccessShareLock);
+       scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+       while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
+       {
+               Oid                     thisoid = HeapTupleGetOid(tup);
+
+               CHECK_FOR_INTERRUPTS(); /* checked once per catalog tuple */
+
+               (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);  /* insert; result not needed */
+       }
+       heap_endscan(scan);
+       heap_close(rel, AccessShareLock);
+
+       return htab;
 }
 
 
@@ -833,15 +1083,15 @@ pgstat_vacuum_tabstat(void)
  *
  *     Tell the collector that we just dropped a database.
  *     (If the message gets lost, we will still clean the dead DB eventually
- *     via future invocations of pgstat_vacuum_tabstat().)
+ *     via future invocations of pgstat_vacuum_stat().)
  * ----------
  */
-static void
+void
 pgstat_drop_database(Oid databaseid)
 {
        PgStat_MsgDropdb msg;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)
                return;
 
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
@@ -855,27 +1105,32 @@ pgstat_drop_database(Oid databaseid)
  *
  *     Tell the collector that we just dropped a relation.
  *     (If the message gets lost, we will still clean the dead entry eventually
- *     via future invocations of pgstat_vacuum_tabstat().)
+ *     via future invocations of pgstat_vacuum_stat().)
+ *
+ *     Currently not used for lack of any good place to call it; we rely
+ *     entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
  * ----------
  */
+#ifdef NOT_USED
 void
 pgstat_drop_relation(Oid relid)
 {
        PgStat_MsgTabpurge msg;
        int                     len;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)     /* no stats socket: nothing to send */
                return;
 
        msg.m_tableid[0] = relid;
        msg.m_nentries = 1;
 
-       len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
+       len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) +sizeof(Oid);  /* fixed part plus one OID slot */
 
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
        msg.m_databaseid = MyDatabaseId;
        pgstat_send(&msg, len);
 }
+#endif   /* NOT_USED */
 
 
 /* ----------
@@ -889,7 +1144,7 @@ pgstat_reset_counters(void)
 {
        PgStat_MsgResetcounter msg;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)
                return;
 
        if (!superuser())
@@ -902,27 +1157,84 @@ pgstat_reset_counters(void)
        pgstat_send(&msg, sizeof(msg));
 }
 
-
 /* ----------
- * pgstat_report_autovac() -
+ * pgstat_reset_shared_counters() -
  *
- *     Called from autovacuum.c to report startup of an autovacuum process.
- *     We are called before InitPostgres is done, so can't rely on MyDatabaseId;
- *     the db OID must be passed in, instead.
+ *     Tell the statistics collector to reset cluster-wide shared counters.
  * ----------
  */
 void
-pgstat_report_autovac(Oid dboid)
+pgstat_reset_shared_counters(const char *target)
 {
-       PgStat_MsgAutovacStart msg;
+       PgStat_MsgResetsharedcounter msg;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)
                return;
 
-       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
-       msg.m_databaseid = dboid;
-       msg.m_start_time = GetCurrentTimestamp();
-
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                errmsg("must be superuser to reset statistics counters")));
+
+       if (strcmp(target, "bgwriter") == 0)
+               msg.m_resettarget = RESET_BGWRITER;
+       else
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("unrecognized reset target: \"%s\"", target),
+                                errhint("Target must be \"bgwriter\".")));
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER);
+       pgstat_send(&msg, sizeof(msg));
+}
+
+/* ----------
+ * pgstat_reset_single_counter() -
+ *
+ *     Tell the statistics collector to reset a single counter.
+ *
+ *     objoid is stored in the message as m_objectid and type as m_resettype;
+ *     the message is always addressed to MyDatabaseId.
+ *
+ *     Returns silently when pgStatSock is invalid; because that test comes
+ *     first, the superuser check (which raises ERROR with
+ *     ERRCODE_INSUFFICIENT_PRIVILEGE) is only reached when the socket exists.
+ * ----------
+ */
+void
+pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type)
+{
+       PgStat_MsgResetsinglecounter msg;
+
+       if (pgStatSock == PGINVALID_SOCKET)
+               return;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                errmsg("must be superuser to reset statistics counters")));
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER);
+       msg.m_databaseid = MyDatabaseId;
+       msg.m_resettype = type;
+       msg.m_objectid = objoid;
+
+       pgstat_send(&msg, sizeof(msg));
+}
+
+/* ----------
+ * pgstat_report_autovac() -
+ *
+ *     Called from autovacuum.c to report startup of an autovacuum process.
+ *     We are called before InitPostgres is done, so can't rely on MyDatabaseId;
+ *     the db OID must be passed in, instead.
+ *
+ *     dboid is recorded as the message's database id and the current
+ *     timestamp as its start time.  Nothing is sent when pgStatSock is
+ *     invalid.
+ * ----------
+ */
+void
+pgstat_report_autovac(Oid dboid)
+{
+       PgStat_MsgAutovacStart msg;
+
+       if (pgStatSock == PGINVALID_SOCKET)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
+       msg.m_databaseid = dboid;
+       msg.m_start_time = GetCurrentTimestamp();
+
        pgstat_send(&msg, sizeof(msg));
 }
 
@@ -934,20 +1246,17 @@ pgstat_report_autovac(Oid dboid)
  * ---------
  */
 void
-pgstat_report_vacuum(Oid tableoid, bool shared,
-                                        bool analyze, PgStat_Counter tuples)
+pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter tuples)
 {
        PgStat_MsgVacuum msg;
 
-       if (pgStatSock < 0 ||
-               !pgstat_collect_tuplelevel)
+       if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
                return;
 
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
        msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
        msg.m_tableoid = tableoid;
-       msg.m_analyze = analyze;
-       msg.m_autovacuum = IsAutoVacuumProcess(); /* is this autovacuum? */
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess();
        msg.m_vacuumtime = GetCurrentTimestamp();
        msg.m_tuples = tuples;
        pgstat_send(&msg, sizeof(msg));
@@ -960,25 +1269,69 @@ pgstat_report_vacuum(Oid tableoid, bool shared,
  * --------
  */
 void
-pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples,
-                                         PgStat_Counter deadtuples)
+pgstat_report_analyze(Relation rel,
+                                         PgStat_Counter livetuples, PgStat_Counter deadtuples)
 {
        PgStat_MsgAnalyze msg;
 
-       if (pgStatSock < 0 ||
-               !pgstat_collect_tuplelevel)
+       if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)     /* no stats socket, or counting disabled */
                return;
 
+       /*
+        * Unlike VACUUM, ANALYZE might be running inside a transaction that has
+        * already inserted and/or deleted rows in the target table. ANALYZE will
+        * have counted such rows as live or dead respectively. Because we will
+        * report our counts of such rows at transaction end, we should subtract
+        * off these counts from what we send to the collector now, else they'll
+        * be double-counted after commit.      (This approach also ensures that the
+        * collector ends up with the right numbers if we abort instead of
+        * committing.)
+        *
+        * Every level of the table's per-(sub)transaction stack is visited by
+        * following the "upper" links starting at rel->pgstat_info->trans.  The
+        * adjustment is skipped entirely when rel->pgstat_info is NULL.
+        */
+       if (rel->pgstat_info != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
+               {
+                       livetuples -= trans->tuples_inserted - trans->tuples_deleted;
+                       deadtuples -= trans->tuples_updated + trans->tuples_deleted;
+               }
+               /* count stuff inserted by already-aborted subxacts, too */
+               deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
+               /* Since ANALYZE's counts are estimates, we could have underflowed */
+               livetuples = Max(livetuples, 0);
+               deadtuples = Max(deadtuples, 0);
+       }
+
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
-       msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-       msg.m_tableoid = tableoid;
-       msg.m_autovacuum = IsAutoVacuumProcess(); /* is this autovacuum? */
+       msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;        /* shared rels use InvalidOid */
+       msg.m_tableoid = RelationGetRelid(rel);
+       msg.m_autovacuum = IsAutoVacuumWorkerProcess();
        msg.m_analyzetime = GetCurrentTimestamp();
        msg.m_live_tuples = livetuples; /* already adjusted and clamped above */
        msg.m_dead_tuples = deadtuples;
        pgstat_send(&msg, sizeof(msg));
 }
 
+/* --------
+ * pgstat_report_recovery_conflict() -
+ *
+ *     Tell the collector about a Hot Standby recovery conflict.
+ *
+ *     reason is copied verbatim into the message's m_reason field; this
+ *     function does not validate it.  The conflict is attributed to
+ *     MyDatabaseId.  Nothing is sent when pgStatSock is invalid or
+ *     pgstat_track_counts is off.
+ * --------
+ */
+void
+pgstat_report_recovery_conflict(int reason)
+{
+       PgStat_MsgRecoveryConflict msg;
+
+       if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+               return;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT);
+       msg.m_databaseid = MyDatabaseId;
+       msg.m_reason = reason;
+       pgstat_send(&msg, sizeof(msg));
+}
 
 /* ----------
  * pgstat_ping() -
@@ -991,253 +1344,766 @@ pgstat_ping(void)
 {
        PgStat_MsgDummy msg;
 
-       if (pgStatSock < 0)
+       if (pgStatSock == PGINVALID_SOCKET)
                return;
 
        pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
        pgstat_send(&msg, sizeof(msg));
 }
 
-/*
- * Enlarge a TabStatArray
+/* ----------
+ * pgstat_send_inquiry() -
+ *
+ *     Notify collector that we need fresh data.
+ *     ts specifies the minimum acceptable timestamp for the stats file.
+ * ----------
  */
 static void
-more_tabstat_space(TabStatArray *tsarr)
+pgstat_send_inquiry(TimestampTz ts)
 {
-       PgStat_MsgTabstat *newMessages;
-       PgStat_MsgTabstat **msgArray;
-       int                     newAlloc;
-       int                     i;
+       PgStat_MsgInquiry msg;
+
+       pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
+       msg.inquiry_time = ts;
+       pgstat_send(&msg, sizeof(msg));
+}
+
 
-       AssertArg(PointerIsValid(tsarr));
+/*
+ * Initialize function call usage data.
+ * Called by the executor before invoking a function.
+ */
+void
+pgstat_init_function_usage(FunctionCallInfoData *fcinfo,
+                                                  PgStat_FunctionCallUsage *fcu)
+{
+       PgStat_BackendFunctionEntry *htabent;
+       bool            found;
+
+       if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
+       {
+               /* stats not wanted */
+               fcu->fs = NULL;
+               return;
+       }
 
-       newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
+       if (!pgStatFunctions)
+       {
+               /* First time through - initialize function stat table */
+               HASHCTL         hash_ctl;
 
-       /* Create (another) quantum of message buffers */
-       newMessages = (PgStat_MsgTabstat *)
-               MemoryContextAllocZero(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
+               memset(&hash_ctl, 0, sizeof(hash_ctl));
+               hash_ctl.keysize = sizeof(Oid);
+               hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
+               hash_ctl.hash = oid_hash;
+               pgStatFunctions = hash_create("Function stat entries",
+                                                                         PGSTAT_FUNCTION_HASH_SIZE,
+                                                                         &hash_ctl,
+                                                                         HASH_ELEM | HASH_FUNCTION);
+       }
 
-       /* Create or enlarge the pointer array */
-       if (tsarr->tsa_messages == NULL)
-               msgArray = (PgStat_MsgTabstat **)
-                       MemoryContextAlloc(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat *) * newAlloc);
-       else
-               msgArray = (PgStat_MsgTabstat **)
-                       repalloc(tsarr->tsa_messages,
-                                        sizeof(PgStat_MsgTabstat *) * newAlloc);
+       /* Get the stats entry for this function, create if necessary */
+       htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid,
+                                                 HASH_ENTER, &found);
+       if (!found)
+               MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts));
+
+       fcu->fs = &htabent->f_counts;
+
+       /* save stats for this function, later used to compensate for recursion */
+       fcu->save_f_time = htabent->f_counts.f_time;
+
+       /* save current backend-wide total time */
+       fcu->save_total = total_func_time;
+
+       /* get clock time as of function start */
+       INSTR_TIME_SET_CURRENT(fcu->f_start);
+}
+
+/*
+ * find_funcstat_entry - find any existing PgStat_BackendFunctionEntry entry
+ *             for specified function
+ *
+ * If no entry, return NULL, don't create a new one
+ *
+ * pgStatFunctions is created lazily by pgstat_init_function_usage(), so a
+ * NULL table simply means this backend has not tracked any function yet;
+ * that case also returns NULL.
+ */
+PgStat_BackendFunctionEntry *
+find_funcstat_entry(Oid func_id)
+{
+       if (pgStatFunctions == NULL)
+               return NULL;
+
+       return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions,
+                                                                                                          (void *) &func_id,
+                                                                                                          HASH_FIND, NULL);    /* lookup only */
+}
+
+/*
+ * Calculate function call usage and update stat counters.
+ * Called by the executor after invoking a function.
+ *
+ * In the case of a set-returning function that runs in value-per-call mode,
+ * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
+ * calls for what the user considers a single call of the function.  The
+ * finalize flag should be TRUE on the last call.
+ */
+void
+pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize)
+{
+       PgStat_FunctionCounts *fs = fcu->fs;
+       instr_time      f_total;
+       instr_time      f_others;
+       instr_time      f_self;
+
+       /* stats not wanted? */
+       if (fs == NULL)
+               return;
+
+       /* total elapsed time in this function call */
+       INSTR_TIME_SET_CURRENT(f_total);
+       INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
 
-       for (i = 0; i < TABSTAT_QUANTUM; i++)
-               msgArray[tsarr->tsa_alloc + i] = newMessages++;
-       tsarr->tsa_messages = msgArray;
-       tsarr->tsa_alloc = newAlloc;
+       /* self usage: elapsed minus anything already charged to other calls */
+       f_others = total_func_time;
+       INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
+       f_self = f_total;
+       INSTR_TIME_SUBTRACT(f_self, f_others);
 
-       Assert(tsarr->tsa_used < tsarr->tsa_alloc);
+       /* update backend-wide total time */
+       INSTR_TIME_ADD(total_func_time, f_self);
+
+       /*
+        * Compute the new total f_time as the total elapsed time added to the
+        * pre-call value of f_time.  This is necessary to avoid double-counting
+        * any time taken by recursive calls of myself.  (We do not need any
+        * similar kluge for self time, since that already excludes any recursive
+        * calls.)
+        */
+       INSTR_TIME_ADD(f_total, fcu->save_f_time);
+
+       /* update counters in function stats table */
+       if (finalize)
+               fs->f_numcalls++;
+       fs->f_time = f_total;
+       INSTR_TIME_ADD(fs->f_time_self, f_self);
+
+       /* indicate that we have something to send */
+       have_function_stats = true;
 }
 
+
 /* ----------
  * pgstat_initstats() -
  *
- *     Called from various places usually dealing with initialization
- *     of Relation or Scan structures. The data placed into these
- *     structures from here tell where later to count for buffer reads,
- *     scans and tuples fetched.
+ *     Initialize a relcache entry to count access statistics.
+ *     Called whenever a relation is opened.
+ *
+ *     We assume that a relcache entry's pgstat_info field is zeroed by
+ *     relcache.c when the relcache entry is made; thereafter it is long-lived
+ *     data.  We can avoid repeated searches of the TabStatus arrays when the
+ *     same relation is touched repeatedly within a transaction.
+ *
+ *     On return rel->pgstat_info is NULL when nothing is to be counted for
+ *     this relation (relkind is not one of the four kinds tested below, the
+ *     stats socket is invalid, or pgstat_track_counts is off); otherwise it
+ *     points at the relation's PgStat_TableStatus entry.
  * ----------
  */
 void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
 {
        Oid                     rel_id = rel->rd_id;
-       PgStat_TableEntry *useent;
-       TabStatArray *tsarr;
-       PgStat_MsgTabstat *tsmsg;
-       int                     mb;
-       int                     i;
+       char            relkind = rel->rd_rel->relkind;
+
+       /* We only count stats for things that have storage */
+       if (!(relkind == RELKIND_RELATION ||
+                 relkind == RELKIND_INDEX ||
+                 relkind == RELKIND_TOASTVALUE ||
+                 relkind == RELKIND_SEQUENCE))
+       {
+               rel->pgstat_info = NULL;
+               return;
+       }
+
+       if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+       {
+               /* We're not counting at all */
+               rel->pgstat_info = NULL;
+               return;
+       }
 
        /*
-        * Initialize data not to count at all.
+        * If we already set up this relation in the current transaction, nothing
+        * to do.
         */
-       stats->tabentry = NULL;
-
-       if (pgStatSock < 0 ||
-               !(pgstat_collect_tuplelevel ||
-                 pgstat_collect_blocklevel))
+       if (rel->pgstat_info != NULL &&
+               rel->pgstat_info->t_id == rel_id)       /* existing link still names this rel */
                return;
 
-       tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
+       /* Else find or make the PgStat_TableStatus entry, and update link */
+       rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       TabStatusArray *prev_tsa;
+       int                     i;
 
        /*
-        * Search the already-used message slots for this relation.
+        * Search the already-used tabstat slots for this relation.
         */
-       for (mb = 0; mb < tsarr->tsa_used; mb++)
+       prev_tsa = NULL;
+       for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
        {
-               tsmsg = tsarr->tsa_messages[mb];
-
-               for (i = tsmsg->m_nentries; --i >= 0;)
+               for (i = 0; i < tsa->tsa_used; i++)
                {
-                       if (tsmsg->m_entry[i].t_id == rel_id)
-                       {
-                               stats->tabentry = (void *) &(tsmsg->m_entry[i]);
-                               return;
-                       }
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
                }
 
-               if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-                       continue;
-
-               /*
-                * Not found, but found a message buffer with an empty slot instead.
-                * Fine, let's use this one.
-                */
-               i = tsmsg->m_nentries++;
-               useent = &tsmsg->m_entry[i];
-               MemSet(useent, 0, sizeof(PgStat_TableEntry));
-               useent->t_id = rel_id;
-               stats->tabentry = (void *) useent;
-               return;
+               if (tsa->tsa_used < TABSTAT_QUANTUM)
+               {
+                       /*
+                        * It must not be present, but we found a free slot instead. Fine,
+                        * let's use this one.  We assume the entry was already zeroed,
+                        * either at creation or after last use.
+                        */
+                       entry = &tsa->tsa_entries[tsa->tsa_used++];
+                       entry->t_id = rel_id;
+                       entry->t_shared = isshared;
+                       return entry;
+               }
        }
 
        /*
-        * If we ran out of message buffers, we just allocate more.
+        * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
         */
-       if (tsarr->tsa_used >= tsarr->tsa_alloc)
-               more_tabstat_space(tsarr);
+       tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+                                                                                                       sizeof(TabStatusArray));
+       if (prev_tsa)
+               prev_tsa->tsa_next = tsa;
+       else
+               pgStatTabList = tsa;
 
        /*
-        * Use the first entry of the next message buffer.
+        * Use the first entry of the new TabStatusArray.
         */
-       mb = tsarr->tsa_used++;
-       tsmsg = tsarr->tsa_messages[mb];
-       tsmsg->m_nentries = 1;
-       useent = &tsmsg->m_entry[0];
-       MemSet(useent, 0, sizeof(PgStat_TableEntry));
-       useent->t_id = rel_id;
-       stats->tabentry = (void *) useent;
+       entry = &tsa->tsa_entries[tsa->tsa_used++];
+       entry->t_id = rel_id;
+       entry->t_shared = isshared;
+       return entry;
 }
 
-
-/* ----------
- * pgstat_count_xact_commit() -
+/*
+ * find_tabstat_entry - find any existing PgStat_TableStatus entry for rel
  *
- *     Called from access/transam/xact.c to count transaction commits.
- * ----------
+ * If no entry, return NULL, don't create a new one
  */
-void
-pgstat_count_xact_commit(void)
+PgStat_TableStatus *
+find_tabstat_entry(Oid rel_id)
 {
-       if      (!pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel)
-               return;
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       int                     i;
 
-       pgStatXactCommit++;
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+       {
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
+               }
+       }
 
-       /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
-        */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       /* Not present */
+       return NULL;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
 
-       if (RegularTabStat.tsa_used == 0)
+       xact_state = pgStatXactStack;
+       if (xact_state == NULL || xact_state->nest_level != nest_level)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               xact_state = (PgStat_SubXactStatus *)
+                       MemoryContextAlloc(TopTransactionContext,
+                                                          sizeof(PgStat_SubXactStatus));
+               xact_state->nest_level = nest_level;
+               xact_state->prev = pgStatXactStack;
+               xact_state->first = NULL;
+               pgStatXactStack = xact_state;
        }
+       return xact_state;
 }
 
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ *
+ * The new record is allocated zero-filled in TopTransactionContext, tagged
+ * with nest_level, and linked two ways: "upper" points at the table's
+ * previous top-of-stack record, and "next" chains it into the list of
+ * records owned by this nest level's PgStat_SubXactStatus.  It then becomes
+ * pgstat_info->trans.
+ *
+ * The pgstat_count_heap_* callers in this file invoke it only when
+ * pgstat_info->trans is NULL or belongs to a different nest level.
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+       PgStat_TableXactStatus *trans;
+
+       /*
+        * If this is the first rel to be modified at the current nest level, we
+        * first have to push a transaction stack entry.
+        */
+       xact_state = get_tabstat_stack_level(nest_level);
+
+       /* Now make a per-table stack entry */
+       trans = (PgStat_TableXactStatus *)
+               MemoryContextAllocZero(TopTransactionContext,
+                                                          sizeof(PgStat_TableXactStatus));
+       trans->nest_level = nest_level;
+       trans->upper = pgstat_info->trans;      /* previous record for this table, if any */
+       trans->parent = pgstat_info;
+       trans->next = xact_state->first;        /* push onto this level's record list */
+       xact_state->first = trans;
+       pgstat_info->trans = trans;     /* new record is now the table's current one */
+}
 
-/* ----------
- * pgstat_count_xact_rollback() -
- *
- *     Called from access/transam/xact.c to count transaction rollbacks.
- * ----------
+/*
+ * pgstat_count_heap_insert - count a tuple insertion at the current nest level
  */
 void
-pgstat_count_xact_rollback(void)
+pgstat_count_heap_insert(Relation rel)
 {
-       if      (!pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel)
-               return;
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-       pgStatXactRollback++;
+       if (pgstat_info != NULL)
+       {
+               /* We have to log the effect at the proper transactional level */
+               int                     nest_level = GetCurrentTransactionNestLevel();
 
-       /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
-        */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_inserted++;  /* per-level count, not t_counts */
+       }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update (and a HOT update if "hot")
+ */
+void
+pgstat_count_heap_update(Relation rel, bool hot)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-       if (RegularTabStat.tsa_used == 0)
+       if (pgstat_info != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               /* We have to log the effect at the proper transactional level */
+               int                     nest_level = GetCurrentTransactionNestLevel();
+
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_updated++;   /* per-level count, not t_counts */
+
+               /* t_tuples_hot_updated is nontransactional, so just advance it */
+               if (hot)
+                       pgstat_info->t_counts.t_tuples_hot_updated++;
        }
 }
 
+/*
+ * pgstat_count_heap_delete - count a tuple deletion at the current nest level
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-/* ----------
- * pgstat_fetch_stat_dbentry() -
+       if (pgstat_info != NULL)
+       {
+               /* We have to log the effect at the proper transactional level */
+               int                     nest_level = GetCurrentTransactionNestLevel();
+
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_deleted++;   /* per-level count, not t_counts */
+       }
+}
+
+/*
+ * pgstat_update_heap_dead_tuples - subtract "delta" from dead-tuples count
  *
- *     Support function for the SQL-callable pgstat* functions. Returns
- *     the collected statistics for one database or NULL. NULL doesn't mean
- *     that the database doesn't exist, it is just not yet known by the
- *     collector, so the caller is better off to report ZERO instead.
- * ----------
+ * The semantics of this are that we are reporting the nontransactional
+ * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases
+ * rather than increasing, and the change goes straight into the per-table
+ * counter, not into transactional state.  Does nothing without pgstat_info.
  */
-PgStat_StatDBEntry *
-pgstat_fetch_stat_dbentry(Oid dbid)
+void
+pgstat_update_heap_dead_tuples(Relation rel, int delta)
 {
-       /*
-        * If not done for this transaction, read the statistics collector stats
-        * file into some hash tables.
-        */
-       backend_read_statsfile();
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
-       /*
-        * Lookup the requested database; return NULL if not found
-        */
-       return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                         (void *) &dbid,
-                                                                                         HASH_FIND, NULL);
+       if (pgstat_info != NULL)
+               pgstat_info->t_counts.t_delta_dead_tuples -= delta;
 }
 
 
 /* ----------
- * pgstat_fetch_stat_tabentry() -
+ * AtEOXact_PgStat
  *
- *     Support function for the SQL-callable pgstat* functions. Returns
- *     the collected statistics for one table or NULL. NULL doesn't mean
- *     that the table doesn't exist, it is just not yet known by the
- *     collector, so the caller is better off to report ZERO instead.
+ *     Called from access/transam/xact.c at top-level transaction commit/abort.
  * ----------
  */
-PgStat_StatTabEntry *
-pgstat_fetch_stat_tabentry(Oid relid)
+void
+AtEOXact_PgStat(bool isCommit)
 {
-       Oid                     dbid;
-       PgStat_StatDBEntry *dbentry;
-       PgStat_StatTabEntry *tabentry;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If not done for this transaction, read the statistics collector stats
-        * file into some hash tables.
+        * Count transaction commit or abort.  (We use counters, not just bools,
+        * in case the reporting message isn't sent right away.)
         */
-       backend_read_statsfile();
+       if (isCommit)
+               pgStatXactCommit++;
+       else
+               pgStatXactRollback++;
 
        /*
-        * Lookup our database, then look in its table hash table.
+        * Transfer transactional tuple-change counts into the base tabstat
+        * entries.  We don't bother to free any of the transactional state, since
+        * it's all in TopTransactionContext and will go away anyway.
         */
-       dbid = MyDatabaseId;
-       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                                                                (void *) &dbid,
-                                                                                                HASH_FIND, NULL);
-       if (dbentry != NULL && dbentry->tables != NULL)
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       /* count attempted actions regardless of commit/abort */
+                       tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+                       tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+                       tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+                       if (isCommit)
+                       {
+                               /* insert adds a live tuple, delete removes one */
+                               tabstat->t_counts.t_delta_live_tuples +=
+                                       trans->tuples_inserted - trans->tuples_deleted;
+                               /* update and delete each create a dead tuple */
+                               tabstat->t_counts.t_delta_dead_tuples +=
+                                       trans->tuples_updated + trans->tuples_deleted;
+                               /* insert, update, delete each count as one change event */
+                               tabstat->t_counts.t_changed_tuples +=
+                                       trans->tuples_inserted + trans->tuples_updated +
+                                       trans->tuples_deleted;
+                       }
+                       else
+                       {
+                               /* inserts and updates each leave a dead tuple; deletes don't */
+                               tabstat->t_counts.t_delta_dead_tuples +=
+                                       trans->tuples_inserted + trans->tuples_updated;
+                               /* an aborted xact generates no changed_tuple events */
+                       }
+                       tabstat->trans = NULL;
+               }
+       }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/* ----------
+ * AtEOSubXact_PgStat
+ *
+ *     Called from access/transam/xact.c at subtransaction commit/abort.
+ * ----------
+ */
+void
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * Transfer transactional tuple-change counts into the next higher
+        * subtransaction state, or into the base tabstat entries on abort.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL &&
+               xact_state->nest_level >= nestDepth)
+       {
+               PgStat_TableXactStatus *trans;
+               PgStat_TableXactStatus *next_trans;
+
+               /* delink xact_state from stack immediately to simplify reuse case */
+               pgStatXactStack = xact_state->prev;
+
+               for (trans = xact_state->first; trans != NULL; trans = next_trans)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       next_trans = trans->next;       /* fetch now: trans may be freed/relinked */
+                       Assert(trans->nest_level == nestDepth);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+                               {
+                                       trans->upper->tuples_inserted += trans->tuples_inserted;
+                                       trans->upper->tuples_updated += trans->tuples_updated;
+                                       trans->upper->tuples_deleted += trans->tuples_deleted;
+                                       tabstat->trans = trans->upper;
+                                       pfree(trans);
+                               }
+                               else
+                               {
+                                       /*
+                                        * When there isn't an immediate parent state, we can just
+                                        * reuse the record instead of going through a
+                                        * palloc/pfree pushup (this works since it's all in
+                                        * TopTransactionContext anyway).  We have to re-link it
+                                        * into the parent level, though, and that might mean
+                                        * pushing a new entry into the pgStatXactStack.
+                                        */
+                                       PgStat_SubXactStatus *upper_xact_state;
+
+                                       upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+                                       trans->next = upper_xact_state->first;
+                                       upper_xact_state->first = trans;
+                                       trans->nest_level = nestDepth - 1;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * On abort, update top-level tabstat counts, then forget the
+                                * subtransaction
+                                */
+
+                               /* count attempted actions regardless of commit/abort */
+                               tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+                               tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+                               tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+                               /* inserts and updates each leave a dead tuple; deletes don't */
+                               tabstat->t_counts.t_delta_dead_tuples +=
+                                       trans->tuples_inserted + trans->tuples_updated;
+                               tabstat->trans = trans->upper;
+                               pfree(trans);
+                       }
+               }
+               pfree(xact_state);
+       }
+}
+
+
+/*
+ * AtPrepare_PgStat
+ *             Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just register one 2PC record per table that has pending
+ * transaction-dependent stats work; no counters are modified here.
+ */
+void
+AtPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+                       TwoPhasePgStatRecord record;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+
+                       record.tuples_inserted = trans->tuples_inserted;
+                       record.tuples_updated = trans->tuples_updated;
+                       record.tuples_deleted = trans->tuples_deleted;
+                       record.t_id = tabstat->t_id;
+                       record.t_shared = tabstat->t_shared;
+
+                       RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+                                                                  &record, sizeof(TwoPhasePgStatRecord));
+               }
+       }
+}
+
+/*
+ * PostPrepare_PgStat
+ *             Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * We don't bother to free any of the transactional state, since it's all
+        * in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       tabstat = trans->parent;
+                       tabstat->trans = NULL;  /* detach table entry from xact state */
+               }
+       }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the counts saved in recdata into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                  void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* Same math as in AtEOXact_PgStat, commit case */
+       pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+       pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+       pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+       pgstat_info->t_counts.t_delta_live_tuples +=
+               rec->tuples_inserted - rec->tuples_deleted;
+       pgstat_info->t_counts.t_delta_dead_tuples +=
+               rec->tuples_updated + rec->tuples_deleted;
+       pgstat_info->t_counts.t_changed_tuples +=
+               rec->tuples_inserted + rec->tuples_updated +
+               rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the counts saved in recdata into our local pgstats state, but treat
+ * them as aborted: live and changed tuple counts are left untouched.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                 void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* Same math as in AtEOXact_PgStat, abort case */
+       pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+       pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+       pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+       pgstat_info->t_counts.t_delta_dead_tuples +=
+               rec->tuples_inserted + rec->tuples_updated;
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_dbentry() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Returns
+ *     the collected statistics for one database or NULL. NULL doesn't mean
+ *     that the database doesn't exist, it is just not yet known by the
+ *     collector, so the caller is better off to report ZERO instead.
+ * ----------
+ */
+PgStat_StatDBEntry *
+pgstat_fetch_stat_dbentry(Oid dbid)
+{
+       /*
+        * If not done for this transaction, read the statistics collector stats
+        * file into some hash tables.
+        */
+       backend_read_statsfile();
+
+       /*
+        * Look up the requested database in pgStatDBHash; return NULL if not found
+        */
+       return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+                                                                                         (void *) &dbid,
+                                                                                         HASH_FIND, NULL);
+}
+
+
+/* ----------
+ * pgstat_fetch_stat_tabentry() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Returns
+ *     the collected statistics for one table or NULL. NULL doesn't mean
+ *     that the table doesn't exist, it is just not yet known by the
+ *     collector, so the caller is better off to report ZERO instead.
+ * ----------
+ */
+PgStat_StatTabEntry *
+pgstat_fetch_stat_tabentry(Oid relid)
+{
+       Oid                     dbid;
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatTabEntry *tabentry;
+
+       /*
+        * If not done for this transaction, read the statistics collector stats
+        * file into some hash tables.
+        */
+       backend_read_statsfile();
+
+       /*
+        * Lookup our database, then look in its table hash table.
+        */
+       dbid = MyDatabaseId;
+       dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+                                                                                                (void *) &dbid,
+                                                                                                HASH_FIND, NULL);
+       if (dbentry != NULL && dbentry->tables != NULL)
+       {
+               tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
                                                                                                           (void *) &relid,
                                                                                                           HASH_FIND, NULL);
                if (tabentry)
@@ -1264,6 +2130,35 @@ pgstat_fetch_stat_tabentry(Oid relid)
 }
 
 
+/* ----------
+ * pgstat_fetch_stat_funcentry() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Returns
+ *     the collected statistics for one function, or NULL if none is found.
+ * ----------
+ */
+PgStat_StatFuncEntry *
+pgstat_fetch_stat_funcentry(Oid func_id)
+{
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatFuncEntry *funcentry = NULL;
+
+       /* Load the stats file if needed (the dbentry fetch below does so too) */
+       backend_read_statsfile();
+
+       /* Look up our database, then find the requested function. */
+       dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
+       if (dbentry != NULL && dbentry->functions != NULL)
+       {
+               funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
+                                                                                                                (void *) &func_id,
+                                                                                                                HASH_FIND, NULL);
+       }
+
+       return funcentry;
+}
+
+
 /* ----------
  * pgstat_fetch_stat_beentry() -
  *
@@ -1301,6 +2196,22 @@ pgstat_fetch_stat_numbackends(void)
        return localNumBackends;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ *     Support function for the SQL-callable pgstat* functions. Returns
+ *     a pointer to globalStats, after calling backend_read_statsfile().
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
+{
+       backend_read_statsfile();
+
+       return &globalStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -1309,6 +2220,9 @@ pgstat_fetch_stat_numbackends(void)
 
 static PgBackendStatus *BackendStatusArray = NULL;
 static PgBackendStatus *MyBEEntry = NULL;
+static char *BackendClientHostnameBuffer = NULL;       /* MaxBackends * NAMEDATALEN */
+static char *BackendAppnameBuffer = NULL;      /* MaxBackends * NAMEDATALEN */
+static char *BackendActivityBuffer = NULL;     /* MaxBackends * query-size GUC */
 
 
 /*
@@ -1320,19 +2234,29 @@ BackendStatusShmemSize(void)
        Size            size;
 
        size = mul_size(sizeof(PgBackendStatus), MaxBackends);
+       size = add_size(size,
+                                       mul_size(NAMEDATALEN, MaxBackends));
+       size = add_size(size,
+                                       mul_size(pgstat_track_activity_query_size, MaxBackends));
+       size = add_size(size,
+                                       mul_size(NAMEDATALEN, MaxBackends));
        return size;
 }
 
 /*
- * Initialize the shared status array during postmaster startup.
+ * Initialize the shared status array and several string buffers
+ * during postmaster startup.
  */
 void
 CreateSharedBackendStatus(void)
 {
-       Size            size = BackendStatusShmemSize();
+       Size            size;
        bool            found;
+       int                     i;
+       char       *buffer;
 
        /* Create or attach to the shared array */
+       size = mul_size(sizeof(PgBackendStatus), MaxBackends);
        BackendStatusArray = (PgBackendStatus *)
                ShmemInitStruct("Backend Status Array", size, &found);
 
@@ -1343,31 +2267,104 @@ CreateSharedBackendStatus(void)
                 */
                MemSet(BackendStatusArray, 0, size);
        }
+
+       /* Create or attach to the shared appname buffer */
+       size = mul_size(NAMEDATALEN, MaxBackends);
+       BackendAppnameBuffer = (char *)
+               ShmemInitStruct("Backend Application Name Buffer", size, &found);
+
+       if (!found)
+       {
+               MemSet(BackendAppnameBuffer, 0, size);
+
+               /* Initialize st_appname pointers. */
+               buffer = BackendAppnameBuffer;
+               for (i = 0; i < MaxBackends; i++)
+               {
+                       BackendStatusArray[i].st_appname = buffer;
+                       buffer += NAMEDATALEN;
+               }
+       }
+
+       /* Create or attach to the shared client hostname buffer */
+       size = mul_size(NAMEDATALEN, MaxBackends);
+       BackendClientHostnameBuffer = (char *)
+               ShmemInitStruct("Backend Client Host Name Buffer", size, &found);
+
+       if (!found)
+       {
+               MemSet(BackendClientHostnameBuffer, 0, size);
+
+               /* Initialize st_clienthostname pointers. */
+               buffer = BackendClientHostnameBuffer;
+               for (i = 0; i < MaxBackends; i++)
+               {
+                       BackendStatusArray[i].st_clienthostname = buffer;
+                       buffer += NAMEDATALEN;
+               }
+       }
+
+       /* Create or attach to the shared activity buffer */
+       size = mul_size(pgstat_track_activity_query_size, MaxBackends);
+       BackendActivityBuffer = (char *)
+               ShmemInitStruct("Backend Activity Buffer", size, &found);
+
+       if (!found)
+       {
+               MemSet(BackendActivityBuffer, 0, size);
+
+               /* Initialize st_activity pointers. */
+               buffer = BackendActivityBuffer;
+               for (i = 0; i < MaxBackends; i++)
+               {
+                       BackendStatusArray[i].st_activity = buffer;
+                       buffer += pgstat_track_activity_query_size;
+               }
+       }
 }
 
 
+/* ----------
+ * pgstat_initialize() -
+ *
+ *     Initialize pgstats state, and set up our on-proc-exit hook.
+ *     Called from InitPostgres.  MyBackendId must be set,
+ *     but we must not have started any transaction yet (since the
+ *     exit hook must run after the last transaction exit).
+ *     NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful.
+ * ----------
+ */
+void
+pgstat_initialize(void)
+{
+       /* Point MyBEEntry at our array slot; MyBackendId is 1-based */
+       Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
+       MyBEEntry = &BackendStatusArray[MyBackendId - 1];
+
+       /* Set up a process-exit hook to clean up */
+       on_shmem_exit(pgstat_beshutdown_hook, 0);
+}
+
 /* ----------
  * pgstat_bestart() -
  *
- *     Initialize this backend's entry in the PgBackendStatus array,
- *     and set up an on-proc-exit hook that will clear it again.
- *     Called from InitPostgres.  MyBackendId and MyDatabaseId must be set.
+ *     Initialize this backend's entry in the PgBackendStatus array.
+ *     Called from InitPostgres.
+ *     MyDatabaseId, session userid, and application_name must be set
+ *     (hence, this cannot be combined with pgstat_initialize).
  * ----------
  */
 void
 pgstat_bestart(void)
 {
-       volatile PgBackendStatus *beentry;
        TimestampTz proc_start_timestamp;
        Oid                     userid;
        SockAddr        clientaddr;
-
-       Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
-       MyBEEntry = &BackendStatusArray[MyBackendId - 1];
+       volatile PgBackendStatus *beentry;
 
        /*
-        * To minimize the time spent modifying the entry, fetch all the
-        * needed data first.
+        * To minimize the time spent modifying the PgBackendStatus entry, fetch
+        * all the needed data first.
         *
         * If we have a MyProcPort, use its session start time (for consistency,
         * and to save a kernel call).
@@ -1390,32 +2387,41 @@ pgstat_bestart(void)
 
        /*
         * Initialize my status entry, following the protocol of bumping
-        * st_changecount before and after; and make sure it's even afterwards.
-        * We use a volatile pointer here to ensure the compiler doesn't try to
-        * get cute.
+        * st_changecount before and after; and make sure it's even afterwards. We
+        * use a volatile pointer here to ensure the compiler doesn't try to get
+        * cute.
         */
        beentry = MyBEEntry;
-       do {
+       do
+       {
                beentry->st_changecount++;
        } while ((beentry->st_changecount & 1) == 0);
 
        beentry->st_procpid = MyProcPid;
        beentry->st_proc_start_timestamp = proc_start_timestamp;
        beentry->st_activity_start_timestamp = 0;
+       beentry->st_xact_start_timestamp = 0;
        beentry->st_databaseid = MyDatabaseId;
        beentry->st_userid = userid;
        beentry->st_clientaddr = clientaddr;
+       beentry->st_clienthostname[0] = '\0';
+       beentry->st_waiting = false;
+       beentry->st_appname[0] = '\0';
        beentry->st_activity[0] = '\0';
-       /* Also make sure the last byte in the string area is always 0 */
-       beentry->st_activity[PGBE_ACTIVITY_SIZE - 1] = '\0';
+       /* Also make sure the last byte in each string area is always 0 */
+       beentry->st_clienthostname[NAMEDATALEN - 1] = '\0';
+       beentry->st_appname[NAMEDATALEN - 1] = '\0';
+       beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0';
 
        beentry->st_changecount++;
        Assert((beentry->st_changecount & 1) == 0);
 
-       /*
-        * Set up a process-exit hook to clean up.
-        */
-       on_shmem_exit(pgstat_beshutdown_hook, 0);
+       if (MyProcPort && MyProcPort->remote_hostname)
+               strlcpy(beentry->st_clienthostname, MyProcPort->remote_hostname, NAMEDATALEN);
+
+       /* Update app name to current GUC setting */
+       if (application_name)
+               pgstat_report_appname(application_name);
 }
 
 /*
@@ -1430,16 +2436,22 @@ pgstat_bestart(void)
 static void
 pgstat_beshutdown_hook(int code, Datum arg)
 {
-       volatile PgBackendStatus *beentry;
+       volatile PgBackendStatus *beentry = MyBEEntry;
 
-       pgstat_report_tabstat();
+       /*
+        * If we got as far as discovering our own database ID, we can report what
+        * we did to the collector.  Otherwise, we'd be sending an invalid
+        * database ID, so forget it.  (This means that accesses to pg_database
+        * during failed backend starts might never get counted.)
+        */
+       if (OidIsValid(MyDatabaseId))
+               pgstat_report_stat(true);
 
        /*
-        * Clear my status entry, following the protocol of bumping
-        * st_changecount before and after.  We use a volatile pointer here
-        * to ensure the compiler doesn't try to get cute.
+        * Clear my status entry, following the protocol of bumping st_changecount
+        * before and after.  We use a volatile pointer here to ensure the
+        * compiler doesn't try to get cute.
         */
-       beentry = MyBEEntry;
        beentry->st_changecount++;
 
        beentry->st_procpid = 0;        /* mark invalid */
@@ -1459,28 +2471,29 @@ pgstat_beshutdown_hook(int code, Datum arg)
 void
 pgstat_report_activity(const char *cmd_str)
 {
-       volatile PgBackendStatus *beentry;
+       volatile PgBackendStatus *beentry = MyBEEntry;
        TimestampTz start_timestamp;
        int                     len;
 
-       if (!pgstat_collect_querystring)
+       TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
+
+       if (!pgstat_track_activities || !beentry)
                return;
 
        /*
-        * To minimize the time spent modifying the entry, fetch all the
-        * needed data first.
+        * To minimize the time spent modifying the entry, fetch all the needed
+        * data first.
         */
        start_timestamp = GetCurrentStatementStartTimestamp();
 
        len = strlen(cmd_str);
-       len = pg_mbcliplen(cmd_str, len, PGBE_ACTIVITY_SIZE - 1);
+       len = pg_mbcliplen(cmd_str, len, pgstat_track_activity_query_size - 1);
 
        /*
         * Update my status entry, following the protocol of bumping
-        * st_changecount before and after.  We use a volatile pointer here
-        * to ensure the compiler doesn't try to get cute.
+        * st_changecount before and after.  We use a volatile pointer here to
+        * ensure the compiler doesn't try to get cute.
         */
-       beentry = MyBEEntry;
        beentry->st_changecount++;
 
        beentry->st_activity_start_timestamp = start_timestamp;
@@ -1491,589 +2504,521 @@ pgstat_report_activity(const char *cmd_str)
        Assert((beentry->st_changecount & 1) == 0);
 }
 
-
 /* ----------
- * pgstat_read_current_status() -
+ * pgstat_report_appname() -
  *
- *     Copy the current contents of the PgBackendStatus array to local memory,
- *     if not already done in this transaction.
+ *     Called to update our application name.
  * ----------
  */
-static void
-pgstat_read_current_status(void)
+void
+pgstat_report_appname(const char *appname)
 {
-       TransactionId topXid = GetTopTransactionId();
-       volatile PgBackendStatus *beentry;
-       PgBackendStatus *localentry;
-       int                     i;
+       volatile PgBackendStatus *beentry = MyBEEntry;
+       int                     len;
 
-       Assert(!pgStatRunningInCollector);
-       if (TransactionIdEquals(pgStatLocalStatusXact, topXid))
-               return;                                 /* already done */
+       if (!beentry)
+               return;
 
-       localBackendStatusTable = (PgBackendStatus *)
-               MemoryContextAlloc(TopTransactionContext,
-                                                  sizeof(PgBackendStatus) * MaxBackends);
-       localNumBackends = 0;
+       /* This should be unnecessary if GUC did its job, but be safe */
+       len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1);
 
-       beentry = BackendStatusArray;
-       localentry = localBackendStatusTable;
-       for (i = 1; i <= MaxBackends; i++)
-       {
-               /*
-                * Follow the protocol of retrying if st_changecount changes while
-                * we copy the entry, or if it's odd.  (The check for odd is needed
-                * to cover the case where we are able to completely copy the entry
-                * while the source backend is between increment steps.)  We use a
-                * volatile pointer here to ensure the compiler doesn't try to get
-                * cute.
-                */
-               for (;;)
-               {
-                       int             save_changecount = beentry->st_changecount;
+       /*
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer here to
+        * ensure the compiler doesn't try to get cute.
+        */
+       beentry->st_changecount++;
 
-                       /*
-                        * XXX if PGBE_ACTIVITY_SIZE is really large, it might be best
-                        * to use strcpy not memcpy for copying the activity string?
-                        */
-                       memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
-
-                       if (save_changecount == beentry->st_changecount &&
-                               (save_changecount & 1) == 0)
-                               break;
-
-                       /* Make sure we can break out of loop if stuck... */
-                       CHECK_FOR_INTERRUPTS();
-               }
-
-               beentry++;
-               /* Only valid entries get included into the local array */
-               if (localentry->st_procpid > 0)
-               {
-                       localentry++;
-                       localNumBackends++;
-               }
-       }
+       memcpy((char *) beentry->st_appname, appname, len);
+       beentry->st_appname[len] = '\0';
 
-       pgStatLocalStatusXact = topXid;
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
 }
 
-
-/* ------------------------------------------------------------
- * Local support functions follow
- * ------------------------------------------------------------
+/*
+ * Report current transaction start timestamp as the specified value.
+ * Zero means there is no active transaction.
  */
+void
+pgstat_report_xact_timestamp(TimestampTz tstamp)
+{
+       volatile PgBackendStatus *beentry = MyBEEntry;
 
+       if (!pgstat_track_activities || !beentry)
+               return;
 
-/* ----------
- * pgstat_setheader() -
- *
- *             Set common header fields in a statistics message
- * ----------
- */
-static void
-pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
-{
-       hdr->m_type = mtype;
+       /*
+        * Update my status entry, following the protocol of bumping
+        * st_changecount before and after.  We use a volatile pointer here to
+        * ensure the compiler doesn't try to get cute.
+        */
+       beentry->st_changecount++;
+       beentry->st_xact_start_timestamp = tstamp;
+       beentry->st_changecount++;
+       Assert((beentry->st_changecount & 1) == 0);
 }
 
-
 /* ----------
- * pgstat_send() -
+ * pgstat_report_waiting() -
  *
- *             Send out one statistics message to the collector
+ *     Called from lock manager to report beginning or end of a lock wait.
+ *
+ * NB: this *must* be able to survive being called before MyBEEntry has been
+ * initialized.
  * ----------
  */
-static void
-pgstat_send(void *msg, int len)
+void
+pgstat_report_waiting(bool waiting)
 {
-       if (pgStatSock < 0)
-               return;
+       volatile PgBackendStatus *beentry = MyBEEntry;
 
-       ((PgStat_MsgHdr *) msg)->m_size = len;
+       if (!pgstat_track_activities || !beentry)
+               return;
 
-#ifdef USE_ASSERT_CHECKING
-       if (send(pgStatSock, msg, len, 0) < 0)
-               elog(LOG, "could not send to statistics collector: %m");
-#else
-       send(pgStatSock, msg, len, 0);
-       /* We deliberately ignore any error from send() */
-#endif
+       /*
+        * Since this is a single-byte field in a struct that only this process
+        * may modify, there seems no need to bother with the st_changecount
+        * protocol.  The update must appear atomic in any case.
+        */
+       beentry->st_waiting = waiting;
 }
 
 
 /* ----------
- * PgstatBufferMain() -
- *
- *     Start up the statistics buffer process.  This is the body of the
- *     postmaster child process.
+ * pgstat_read_current_status() -
  *
- *     The argc/argv parameters are valid only in EXEC_BACKEND case.
+ *     Copy the current contents of the PgBackendStatus array to local memory,
+ *     if not already done in this transaction.
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatBufferMain(int argc, char *argv[])
+static void
+pgstat_read_current_status(void)
 {
-       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
-
-       MyProcPid = getpid();           /* reset MyProcPid */
+       volatile PgBackendStatus *beentry;
+       PgBackendStatus *localtable;
+       PgBackendStatus *localentry;
+       char       *localappname,
+                          *localactivity;
+       int                     i;
 
-       /*
-        * Ignore all signals usually bound to some action in the postmaster,
-        * except for SIGCHLD and SIGQUIT --- see pgstat_recvbuffer.
-        */
-       pqsignal(SIGHUP, SIG_IGN);
-       pqsignal(SIGINT, SIG_IGN);
-       pqsignal(SIGTERM, SIG_IGN);
-       pqsignal(SIGQUIT, pgstat_exit);
-       pqsignal(SIGALRM, SIG_IGN);
-       pqsignal(SIGPIPE, SIG_IGN);
-       pqsignal(SIGUSR1, SIG_IGN);
-       pqsignal(SIGUSR2, SIG_IGN);
-       pqsignal(SIGCHLD, pgstat_die);
-       pqsignal(SIGTTIN, SIG_DFL);
-       pqsignal(SIGTTOU, SIG_DFL);
-       pqsignal(SIGCONT, SIG_DFL);
-       pqsignal(SIGWINCH, SIG_DFL);
-       /* unblock will happen in pgstat_recvbuffer */
+       Assert(!pgStatRunningInCollector);
+       if (localBackendStatusTable)
+               return;                                 /* already done */
 
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc, argv);
-#endif
+       pgstat_setup_memcxt();
 
-       /*
-        * Start a buffering process to read from the socket, so we have a little
-        * more time to process incoming messages.
-        *
-        * NOTE: the process structure is: postmaster is parent of buffer process
-        * is parent of collector process.      This way, the buffer can detect
-        * collector failure via SIGCHLD, whereas otherwise it wouldn't notice
-        * collector failure until it tried to write on the pipe.  That would mean
-        * that after the postmaster started a new collector, we'd have two buffer
-        * processes competing to read from the UDP socket --- not good.
-        */
-       if (pgpipe(pgStatPipe) < 0)
-               ereport(ERROR,
-                               (errcode_for_socket_access(),
-                                errmsg("could not create pipe for statistics buffer: %m")));
+       localtable = (PgBackendStatus *)
+               MemoryContextAlloc(pgStatLocalContext,
+                                                  sizeof(PgBackendStatus) * MaxBackends);
+       localappname = (char *)
+               MemoryContextAlloc(pgStatLocalContext,
+                                                  NAMEDATALEN * MaxBackends);
+       localactivity = (char *)
+               MemoryContextAlloc(pgStatLocalContext,
+                                                  pgstat_track_activity_query_size * MaxBackends);
+       localNumBackends = 0;
 
-       /* child becomes collector process */
-#ifdef EXEC_BACKEND
-       pgStatCollectorPid = pgstat_forkexec(STAT_PROC_COLLECTOR);
-#else
-       pgStatCollectorPid = fork();
-#endif
-       switch (pgStatCollectorPid)
+       beentry = BackendStatusArray;
+       localentry = localtable;
+       for (i = 1; i <= MaxBackends; i++)
        {
-               case -1:
-                       ereport(ERROR,
-                                       (errmsg("could not fork statistics collector: %m")));
+               /*
+                * Follow the protocol of retrying if st_changecount changes while we
+                * copy the entry, or if it's odd.  (The check for odd is needed to
+                * cover the case where we are able to completely copy the entry while
+                * the source backend is between increment steps.)      We use a volatile
+                * pointer here to ensure the compiler doesn't try to get cute.
+                */
+               for (;;)
+               {
+                       int                     save_changecount = beentry->st_changecount;
 
-#ifndef EXEC_BACKEND
-               case 0:
-                       /* child becomes collector process */
-                       PgstatCollectorMain(0, NULL);
-                       break;
-#endif
+                       localentry->st_procpid = beentry->st_procpid;
+                       if (localentry->st_procpid > 0)
+                       {
+                               memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
 
-               default:
-                       /* parent becomes buffer process */
-                       closesocket(pgStatPipe[0]);
-                       pgstat_recvbuffer();
+                               /*
+                                * strcpy is safe even if the string is modified concurrently,
+                                * because there's always a \0 at the end of the buffer.
+                                */
+                               strcpy(localappname, (char *) beentry->st_appname);
+                               localentry->st_appname = localappname;
+                               strcpy(localactivity, (char *) beentry->st_activity);
+                               localentry->st_activity = localactivity;
+                       }
+
+                       if (save_changecount == beentry->st_changecount &&
+                               (save_changecount & 1) == 0)
+                               break;
+
+                       /* Make sure we can break out of loop if stuck... */
+                       CHECK_FOR_INTERRUPTS();
+               }
+
+               beentry++;
+               /* Only valid entries get included into the local array */
+               if (localentry->st_procpid > 0)
+               {
+                       localentry++;
+                       localappname += NAMEDATALEN;
+                       localactivity += pgstat_track_activity_query_size;
+                       localNumBackends++;
+               }
        }
-       exit(0);
+
+       /* Set the pointer only after completion of a valid table */
+       localBackendStatusTable = localtable;
 }
 
 
 /* ----------
- * PgstatCollectorMain() -
+ * pgstat_get_backend_current_activity() -
  *
- *     Start up the statistics collector itself.  This is the body of the
- *     postmaster grandchild process.
+ *     Return a string representing the current activity of the backend with
+ *     the specified PID.      This looks directly at the BackendStatusArray,
+ *     and so will provide current information regardless of the age of our
+ *     transaction's snapshot of the status array.
  *
- *     The argc/argv parameters are valid only in EXEC_BACKEND case.
+ *     It is the caller's responsibility to invoke this only for backends whose
+ *     state is expected to remain stable while the result is in use.  The
+ *     only current use is in deadlock reporting, where we can expect that
+ *     the target backend is blocked on a lock.  (There are corner cases
+ *     where the target's wait could get aborted while we are looking at it,
+ *     but the very worst consequence is to return a pointer to a string
+ *     that's been changed, so we won't worry too much.)
+ *
+ *     Note: return strings for special cases match pg_stat_get_backend_activity.
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+const char *
+pgstat_get_backend_current_activity(int pid, bool checkUser)
 {
-       PgStat_Msg      msg;
-       fd_set          rfds;
-       int                     readPipe;
-       int                     len = 0;
-       struct itimerval timeout;
-       bool            need_timer = false;
-
-       MyProcPid = getpid();           /* reset MyProcPid */
-
-       /*
-        * Reset signal handling.  With the exception of restoring default SIGCHLD
-        * and SIGQUIT handling, this is a no-op in the non-EXEC_BACKEND case
-        * because we'll have inherited these settings from the buffer process;
-        * but it's not a no-op for EXEC_BACKEND.
-        */
-       pqsignal(SIGHUP, SIG_IGN);
-       pqsignal(SIGINT, SIG_IGN);
-       pqsignal(SIGTERM, SIG_IGN);
-#ifndef WIN32
-       pqsignal(SIGQUIT, SIG_IGN);
-#else
-       /* kluge to allow buffer process to kill collector; FIXME */
-       pqsignal(SIGQUIT, pgstat_exit);
-#endif
-       pqsignal(SIGALRM, force_statwrite);
-       pqsignal(SIGPIPE, SIG_IGN);
-       pqsignal(SIGUSR1, SIG_IGN);
-       pqsignal(SIGUSR2, SIG_IGN);
-       pqsignal(SIGCHLD, SIG_DFL);
-       pqsignal(SIGTTIN, SIG_DFL);
-       pqsignal(SIGTTOU, SIG_DFL);
-       pqsignal(SIGCONT, SIG_DFL);
-       pqsignal(SIGWINCH, SIG_DFL);
-       PG_SETMASK(&UnBlockSig);
-
-#ifdef EXEC_BACKEND
-       pgstat_parseArgs(argc, argv);
-#endif
-
-       /* Close unwanted files */
-       closesocket(pgStatPipe[1]);
-       closesocket(pgStatSock);
-
-       /*
-        * Identify myself via ps
-        */
-       init_ps_display("stats collector process", "", "");
-       set_ps_display("");
-
-       /*
-        * Arrange to write the initial status file right away
-        */
-       need_statwrite = true;
-
-       /* Preset the delay between status file writes */
-       MemSet(&timeout, 0, sizeof(struct itimerval));
-       timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
-       timeout.it_value.tv_usec = PGSTAT_STAT_INTERVAL % 1000;
-
-       /*
-        * Read in an existing statistics stats file or initialize the stats to
-        * zero.
-        */
-       pgStatRunningInCollector = true;
-       pgstat_read_statsfile(&pgStatDBHash, InvalidOid);
-
-       readPipe = pgStatPipe[0];
+       PgBackendStatus *beentry;
+       int                     i;
 
-       /*
-        * Process incoming messages and handle all the reporting stuff until
-        * there are no more messages.
-        */
-       for (;;)
+       beentry = BackendStatusArray;
+       for (i = 1; i <= MaxBackends; i++)
        {
                /*
-                * If time to write the stats file, do so.  Note that the alarm
-                * interrupt isn't re-enabled immediately, but only after we next
-                * receive a stats message; so no cycles are wasted when there is
-                * nothing going on.
+                * Although we expect the target backend's entry to be stable, that
+                * doesn't imply that anyone else's is.  To avoid identifying the
+                * wrong backend, while we check for a match to the desired PID we
+                * must follow the protocol of retrying if st_changecount changes
+                * while we examine the entry, or if it's odd.  (This might be
+                * unnecessary, since fetching or storing an int is almost certainly
+                * atomic, but let's play it safe.)  We use a volatile pointer here to
+                * ensure the compiler doesn't try to get cute.
                 */
-               if (need_statwrite)
+               volatile PgBackendStatus *vbeentry = beentry;
+               bool            found;
+
+               for (;;)
                {
-                       pgstat_write_statsfile();
-                       need_statwrite = false;
-                       need_timer = true;
-               }
+                       int                     save_changecount = vbeentry->st_changecount;
 
-               /*
-                * Setup the descriptor set for select(2)
-                */
-               FD_ZERO(&rfds);
-               FD_SET(readPipe, &rfds);
+                       found = (vbeentry->st_procpid == pid);
 
-               /*
-                * Now wait for something to do.
-                */
-               if (select(readPipe + 1, &rfds, NULL, NULL, NULL) < 0)
-               {
-                       if (errno == EINTR)
-                               continue;
-                       ereport(ERROR,
-                                       (errcode_for_socket_access(),
-                                        errmsg("select() failed in statistics collector: %m")));
+                       if (save_changecount == vbeentry->st_changecount &&
+                               (save_changecount & 1) == 0)
+                               break;
+
+                       /* Make sure we can break out of loop if stuck... */
+                       CHECK_FOR_INTERRUPTS();
                }
 
-               /*
-                * Check if there is a new statistics message to collect.
-                */
-               if (FD_ISSET(readPipe, &rfds))
+               if (found)
                {
-                       /*
-                        * We may need to issue multiple read calls in case the buffer
-                        * process didn't write the message in a single write, which is
-                        * possible since it dumps its buffer bytewise. In any case, we'd
-                        * need two reads since we don't know the message length
-                        * initially.
-                        */
-                       int                     nread = 0;
-                       int                     targetlen = sizeof(PgStat_MsgHdr);              /* initial */
-                       bool            pipeEOF = false;
+                       /* Now it is safe to use the non-volatile pointer */
+                       if (checkUser && !superuser() && beentry->st_userid != GetUserId())
+                               return "<insufficient privilege>";
+                       else if (*(beentry->st_activity) == '\0')
+                               return "<command string not enabled>";
+                       else
+                               return beentry->st_activity;
+               }
 
-                       while (nread < targetlen)
-                       {
-                               len = piperead(readPipe, ((char *) &msg) + nread,
-                                                          targetlen - nread);
-                               if (len < 0)
-                               {
-                                       if (errno == EINTR)
-                                               continue;
-                                       ereport(ERROR,
-                                                       (errcode_for_socket_access(),
-                                                        errmsg("could not read from statistics collector pipe: %m")));
-                               }
-                               if (len == 0)   /* EOF on the pipe! */
-                               {
-                                       pipeEOF = true;
-                                       break;
-                               }
-                               nread += len;
-                               if (nread == sizeof(PgStat_MsgHdr))
-                               {
-                                       /* we have the header, compute actual msg length */
-                                       targetlen = msg.msg_hdr.m_size;
-                                       if (targetlen < (int) sizeof(PgStat_MsgHdr) ||
-                                               targetlen > (int) sizeof(msg))
-                                       {
-                                               /*
-                                                * Bogus message length implies that we got out of
-                                                * sync with the buffer process somehow. Abort so that
-                                                * we can restart both processes.
-                                                */
-                                               ereport(ERROR,
-                                                         (errmsg("invalid statistics message length")));
-                                       }
-                               }
-                       }
+               beentry++;
+       }
 
-                       /*
-                        * EOF on the pipe implies that the buffer process exited. Fall
-                        * out of outer loop.
-                        */
-                       if (pipeEOF)
-                               break;
+       /* If we get here, caller is in error ... */
+       return "<backend information not available>";
+}
 
-                       /*
-                        * Distribute the message to the specific function handling it.
-                        */
-                       switch (msg.msg_hdr.m_type)
-                       {
-                               case PGSTAT_MTYPE_DUMMY:
-                                       break;
 
-                               case PGSTAT_MTYPE_TABSTAT:
-                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, nread);
-                                       break;
+/* ------------------------------------------------------------
+ * Local support functions follow
+ * ------------------------------------------------------------
+ */
 
-                               case PGSTAT_MTYPE_TABPURGE:
-                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, nread);
-                                       break;
 
-                               case PGSTAT_MTYPE_DROPDB:
-                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, nread);
-                                       break;
+/* ----------
+ * pgstat_setheader() -
+ *
+ *             Set common header fields in a statistics message
+ * ----------
+ */
+static void
+pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
+{
+       hdr->m_type = mtype;
+}
 
-                               case PGSTAT_MTYPE_RESETCOUNTER:
-                                       pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
-                                                                                        nread);
-                                       break;
 
-                               case PGSTAT_MTYPE_AUTOVAC_START:
-                                       pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, nread);
-                                       break;
+/* ----------
+ * pgstat_send() -
+ *
+ *             Send out one statistics message to the collector
+ * ----------
+ */
+static void
+pgstat_send(void *msg, int len)
+{
+       int                     rc;
 
-                               case PGSTAT_MTYPE_VACUUM:
-                                       pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, nread);
-                                       break;
+       if (pgStatSock == PGINVALID_SOCKET)
+               return;
 
-                               case PGSTAT_MTYPE_ANALYZE:
-                                       pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, nread);
-                                       break;
+       ((PgStat_MsgHdr *) msg)->m_size = len;
 
-                               default:
-                                       break;
-                       }
+       /* We'll retry after EINTR, but ignore all other failures */
+       do
+       {
+               rc = send(pgStatSock, msg, len, 0);
+       } while (rc < 0 && errno == EINTR);
 
-                       /*
-                        * If this is the first message after we wrote the stats file the
-                        * last time, enable the alarm interrupt to make it be written
-                        * again later.
-                        */
-                       if (need_timer)
-                       {
-                               if (setitimer(ITIMER_REAL, &timeout, NULL))
-                                       ereport(ERROR,
-                                                 (errmsg("could not set statistics collector timer: %m")));
-                               need_timer = false;
-                       }
-               }
+#ifdef USE_ASSERT_CHECKING
+       /* In debug builds, log send failures ... */
+       if (rc < 0)
+               elog(LOG, "could not send to statistics collector: %m");
+#endif
+}
 
-               /*
-                * Note that we do NOT check for postmaster exit inside the loop; only
-                * EOF on the buffer pipe causes us to fall out.  This ensures we
-                * don't exit prematurely if there are still a few messages in the
-                * buffer or pipe at postmaster shutdown.
-                */
-       }
+/* ----------
+ * pgstat_send_bgwriter() -
+ *
+ *             Send bgwriter statistics to the collector
+ * ----------
+ */
+void
+pgstat_send_bgwriter(void)
+{
+       /* We assume this initializes to zeroes */
+       static const PgStat_MsgBgWriter all_zeroes;
 
        /*
-        * Okay, we saw EOF on the buffer pipe, so there are no more messages to
-        * process.  If the buffer process quit because of postmaster shutdown, we
-        * want to save the final stats to reuse at next startup. But if the
-        * buffer process failed, it seems best not to (there may even now be a
-        * new collector firing up, and we don't want it to read a
-        * partially-rewritten stats file).
+        * This function can be called even if nothing at all has happened. In
+        * this case, avoid sending a completely empty message to the stats
+        * collector.
         */
-       if (!PostmasterIsAlive(false))
-               pgstat_write_statsfile();
-}
+       if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+               return;
 
+       /*
+        * Prepare and send the message
+        */
+       pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+       pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
 
-/* SIGALRM signal handler for collector process */
-static void
-force_statwrite(SIGNAL_ARGS)
-{
-       need_statwrite = true;
+       /*
+        * Clear out the statistics buffer, so it can be re-used.
+        */
+       MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
 }
 
 
 /* ----------
- * pgstat_recvbuffer() -
+ * PgstatCollectorMain() -
+ *
+ *     Start up the statistics collector process.      This is the body of the
+ *     postmaster child process.
  *
- *     This is the body of the separate buffering process. Its only
- *     purpose is to receive messages from the UDP socket as fast as
- *     possible and forward them over a pipe into the collector itself.
- *     If the collector is slow to absorb messages, they are buffered here.
+ *     The argc/argv parameters are valid only in EXEC_BACKEND case.
  * ----------
  */
-static void
-pgstat_recvbuffer(void)
+NON_EXEC_STATIC void
+PgstatCollectorMain(int argc, char *argv[])
 {
-       fd_set          rfds;
-       fd_set          wfds;
-       struct timeval timeout;
-       int                     writePipe = pgStatPipe[1];
-       int                     maxfd;
        int                     len;
-       int                     xfr;
-       int                     frm;
-       PgStat_Msg      input_buffer;
-       char       *msgbuffer;
-       int                     msg_send = 0;   /* next send index in buffer */
-       int                     msg_recv = 0;   /* next receive index */
-       int                     msg_have = 0;   /* number of bytes stored */
-       bool            overflow = false;
+       PgStat_Msg      msg;
+
+#ifndef WIN32
+#ifdef HAVE_POLL
+       struct pollfd input_fd;
+#else
+       struct timeval sel_timeout;
+       fd_set          rfds;
+#endif
+#endif
+
+       IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
+
+       MyProcPid = getpid();           /* reset MyProcPid */
+
+       MyStartTime = time(NULL);       /* record Start Time for logging */
 
        /*
-        * Identify myself via ps
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.  (pgstat probably never has any
+        * child processes, but for consistency we make all postmaster child
+        * processes do this.)
         */
-       init_ps_display("stats buffer process", "", "");
-       set_ps_display("");
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
 
        /*
-        * We want to die if our child collector process does.  There are two ways
-        * we might notice that it has died: receive SIGCHLD, or get a write
-        * failure on the pipe leading to the child.  We can set SIGPIPE to kill
-        * us here.  Our SIGCHLD handler was already set up before we forked (must
-        * do it that way, else it's a race condition).
+        * Ignore all signals usually bound to some action in the postmaster,
+        * except SIGHUP and SIGQUIT.
         */
-       pqsignal(SIGPIPE, SIG_DFL);
+       pqsignal(SIGHUP, pgstat_sighup_handler);
+       pqsignal(SIGINT, SIG_IGN);
+       pqsignal(SIGTERM, SIG_IGN);
+       pqsignal(SIGQUIT, pgstat_exit);
+       pqsignal(SIGALRM, SIG_IGN);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN);
+       pqsignal(SIGUSR2, SIG_IGN);
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
        PG_SETMASK(&UnBlockSig);
 
        /*
-        * Set the write pipe to nonblock mode, so that we cannot block when the
-        * collector falls behind.
+        * Identify myself via ps
         */
-       if (!pg_set_noblock(writePipe))
-               ereport(ERROR,
-                               (errcode_for_socket_access(),
-                                errmsg("could not set statistics collector pipe to nonblocking mode: %m")));
+       init_ps_display("stats collector process", "", "", "");
+
+       /*
+        * Arrange to write the initial status file right away
+        */
+       last_statrequest = GetCurrentTimestamp();
+       last_statwrite = last_statrequest - 1;
+
+       /*
+        * Read in an existing statistics file or initialize the stats to
+        * zero.
+        */
+       pgStatRunningInCollector = true;
+       pgStatDBHash = pgstat_read_statsfile(InvalidOid, true);
 
        /*
-        * Allocate the message buffer
+        * Set up the descriptor set for select(2).  Since only one bit in the set
+        * ever changes, we need not repeat FD_ZERO each time.
         */
-       msgbuffer = (char *) palloc(PGSTAT_RECVBUFFERSZ);
+#if !defined(HAVE_POLL) && !defined(WIN32)
+       FD_ZERO(&rfds);
+#endif
 
        /*
-        * Loop forever
+        * Loop to process messages until we get SIGQUIT or detect ungraceful
+        * death of our parent postmaster.
+        *
+        * For performance reasons, we don't want to do a PostmasterIsAlive() test
+        * after every message; instead, do it only when select()/poll() is
+        * interrupted by timeout.      In essence, we'll stay alive as long as
+        * backends keep sending us stuff often, even if the postmaster is gone.
         */
        for (;;)
        {
-               FD_ZERO(&rfds);
-               FD_ZERO(&wfds);
-               maxfd = -1;
+               int                     got_data;
 
                /*
-                * As long as we have buffer space we add the socket to the read
-                * descriptor set.
+                * Quit if we get SIGQUIT from the postmaster.
                 */
-               if (msg_have <= (int) (PGSTAT_RECVBUFFERSZ - sizeof(PgStat_Msg)))
-               {
-                       FD_SET(pgStatSock, &rfds);
-                       maxfd = pgStatSock;
-                       overflow = false;
-               }
-               else
+               if (need_exit)
+                       break;
+
+               /*
+                * Reload configuration if we got SIGHUP from the postmaster.
+                */
+               if (got_SIGHUP)
                {
-                       if (!overflow)
-                       {
-                               ereport(LOG,
-                                               (errmsg("statistics buffer is full")));
-                               overflow = true;
-                       }
+                       ProcessConfigFile(PGC_SIGHUP);
+                       got_SIGHUP = false;
                }
 
                /*
-                * If we have messages to write out, we add the pipe to the write
-                * descriptor set.
+                * Write the stats file if a new request has arrived that is not
+                * satisfied by existing file.
+                */
+               if (last_statwrite < last_statrequest)
+                       pgstat_write_statsfile(false);
+
+               /*
+                * Wait for a message to arrive; but not for more than
+                * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
+                * shut down after an ungraceful postmaster termination; so it needn't
+                * be very fast.  However, on some systems SIGQUIT won't interrupt the
+                * poll/select call, so this also limits speed of response to SIGQUIT,
+                * which is more important.)
+                *
+                * We use poll(2) if available, otherwise select(2). Win32 has its own
+                * implementation.
                 */
-               if (msg_have > 0)
+#ifndef WIN32
+#ifdef HAVE_POLL
+               input_fd.fd = pgStatSock;
+               input_fd.events = POLLIN | POLLERR;
+               input_fd.revents = 0;
+
+               if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
                {
-                       FD_SET(writePipe, &wfds);
-                       if (writePipe > maxfd)
-                               maxfd = writePipe;
+                       if (errno == EINTR)
+                               continue;
+                       ereport(ERROR,
+                                       (errcode_for_socket_access(),
+                                        errmsg("poll() failed in statistics collector: %m")));
                }
 
+               got_data = (input_fd.revents != 0);
+#else                                                  /* !HAVE_POLL */
+
+               FD_SET(pgStatSock, &rfds);
+
                /*
-                * Wait for some work to do; but not for more than 10 seconds. (This
-                * determines how quickly we will shut down after an ungraceful
-                * postmaster termination; so it needn't be very fast.)
-                *
-                * struct timeout is modified by select() on some operating systems,
+                * timeout struct is modified by select() on some operating systems,
                 * so re-fill it each time.
                 */
-               timeout.tv_sec = 10;
-               timeout.tv_usec = 0;
+               sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
+               sel_timeout.tv_usec = 0;
 
-               if (select(maxfd + 1, &rfds, &wfds, NULL, &timeout) < 0)
+               if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
                {
                        if (errno == EINTR)
                                continue;
                        ereport(ERROR,
                                        (errcode_for_socket_access(),
-                                        errmsg("select() failed in statistics buffer: %m")));
+                                        errmsg("select() failed in statistics collector: %m")));
                }
 
+               got_data = FD_ISSET(pgStatSock, &rfds);
+#endif   /* HAVE_POLL */
+#else                                                  /* WIN32 */
+               got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
+                                                                                          PGSTAT_SELECT_TIMEOUT * 1000);
+#endif
+
                /*
                 * If there is a message on the socket, read it and check for
                 * validity.
                 */
-               if (FD_ISSET(pgStatSock, &rfds))
+               if (got_data)
                {
-                       len = recv(pgStatSock, (char *) &input_buffer,
+                       len = recv(pgStatSock, (char *) &msg,
                                           sizeof(PgStat_Msg), 0);
                        if (len < 0)
+                       {
+                               if (errno == EINTR)
+                                       continue;
                                ereport(ERROR,
                                                (errcode_for_socket_access(),
                                                 errmsg("could not read statistics message: %m")));
+                       }
 
                        /*
                         * We ignore messages that are smaller than our common header
@@ -2084,110 +3029,114 @@ pgstat_recvbuffer(void)
                        /*
                         * The received length must match the length in the header
                         */
-                       if (input_buffer.msg_hdr.m_size != len)
+                       if (msg.msg_hdr.m_size != len)
                                continue;
 
                        /*
-                        * O.K. - we accept this message.  Copy it to the circular
-                        * msgbuffer.
+                        * O.K. - we accept this message.  Process it.
                         */
-                       frm = 0;
-                       while (len > 0)
+                       switch (msg.msg_hdr.m_type)
                        {
-                               xfr = PGSTAT_RECVBUFFERSZ - msg_recv;
-                               if (xfr > len)
-                                       xfr = len;
-                               Assert(xfr > 0);
-                               memcpy(msgbuffer + msg_recv,
-                                          ((char *) &input_buffer) + frm,
-                                          xfr);
-                               msg_recv += xfr;
-                               if (msg_recv == PGSTAT_RECVBUFFERSZ)
-                                       msg_recv = 0;
-                               msg_have += xfr;
-                               frm += xfr;
-                               len -= xfr;
+                               case PGSTAT_MTYPE_DUMMY:
+                                       break;
+
+                               case PGSTAT_MTYPE_INQUIRY:
+                                       pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_TABSTAT:
+                                       pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_TABPURGE:
+                                       pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_DROPDB:
+                                       pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_RESETCOUNTER:
+                                       pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
+                                                                                        len);
+                                       break;
+
+                               case PGSTAT_MTYPE_RESETSHAREDCOUNTER:
+                                       pgstat_recv_resetsharedcounter(
+                                                                          (PgStat_MsgResetsharedcounter *) &msg,
+                                                                                                  len);
+                                       break;
+
+                               case PGSTAT_MTYPE_RESETSINGLECOUNTER:
+                                       pgstat_recv_resetsinglecounter(
+                                                                          (PgStat_MsgResetsinglecounter *) &msg,
+                                                                                                  len);
+                                       break;
+
+                               case PGSTAT_MTYPE_AUTOVAC_START:
+                                       pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_VACUUM:
+                                       pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_ANALYZE:
+                                       pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_BGWRITER:
+                                       pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_FUNCSTAT:
+                                       pgstat_recv_funcstat((PgStat_MsgFuncstat *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_FUNCPURGE:
+                                       pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
+                                       break;
+
+                               case PGSTAT_MTYPE_RECOVERYCONFLICT:
+                                       pgstat_recv_recoveryconflict((PgStat_MsgRecoveryConflict *) &msg, len);
+                                       break;
+
+                               default:
+                                       break;
                        }
                }
-
-               /*
-                * If the collector is ready to receive, write some data into his
-                * pipe.  We may or may not be able to write all that we have.
-                *
-                * NOTE: if what we have is less than PIPE_BUF bytes but more than the
-                * space available in the pipe buffer, most kernels will refuse to
-                * write any of it, and will return EAGAIN.  This means we will
-                * busy-loop until the situation changes (either because the collector
-                * caught up, or because more data arrives so that we have more than
-                * PIPE_BUF bytes buffered).  This is not good, but is there any way
-                * around it?  We have no way to tell when the collector has caught
-                * up...
-                */
-               if (FD_ISSET(writePipe, &wfds))
+               else
                {
-                       xfr = PGSTAT_RECVBUFFERSZ - msg_send;
-                       if (xfr > msg_have)
-                               xfr = msg_have;
-                       Assert(xfr > 0);
-                       len = pipewrite(writePipe, msgbuffer + msg_send, xfr);
-                       if (len < 0)
-                       {
-                               if (errno == EINTR || errno == EAGAIN)
-                                       continue;       /* not enough space in pipe */
-                               ereport(ERROR,
-                                               (errcode_for_socket_access(),
-                               errmsg("could not write to statistics collector pipe: %m")));
-                       }
-                       /* NB: len < xfr is okay */
-                       msg_send += len;
-                       if (msg_send == PGSTAT_RECVBUFFERSZ)
-                               msg_send = 0;
-                       msg_have -= len;
+                       /*
+                        * We can only get here if the select/poll timeout elapsed. Check
+                        * for postmaster death.
+                        */
+                       if (!PostmasterIsAlive(true))
+                               break;
                }
+       }                                                       /* end of message-processing loop */
 
-               /*
-                * Make sure we forwarded all messages before we check for postmaster
-                * termination.
-                */
-               if (msg_have != 0 || FD_ISSET(pgStatSock, &rfds))
-                       continue;
+       /*
+        * Save the final stats to reuse at next startup.
+        */
+       pgstat_write_statsfile(true);
 
-               /*
-                * If the postmaster has terminated, we die too.  (This is no longer
-                * the normal exit path, however.)
-                */
-               if (!PostmasterIsAlive(true))
-                       exit(0);
-       }
+       exit(0);
 }
 
-/* SIGQUIT signal handler for buffer process */
+
+/* SIGQUIT signal handler for collector process */
 static void
 pgstat_exit(SIGNAL_ARGS)
 {
-       /*
-        * For now, we just nail the doors shut and get out of town.  It might be
-        * cleaner to allow any pending messages to be sent, but that creates a
-        * tradeoff against speed of exit.
-        */
-
-       /*
-        * If running in bufferer, kill our collector as well. On some broken
-        * win32 systems, it does not shut down automatically because of issues
-        * with socket inheritance.  XXX so why not fix the socket inheritance...
-        */
-#ifdef WIN32
-       if (pgStatCollectorPid > 0)
-               kill(pgStatCollectorPid, SIGQUIT);
-#endif
-       exit(0);
+       need_exit = true;
 }
 
-/* SIGCHLD signal handler for buffer process */
+/* SIGHUP handler for collector process */
 static void
-pgstat_die(SIGNAL_ARGS)
+pgstat_sighup_handler(SIGNAL_ARGS)
 {
-       exit(1);
+       got_SIGHUP = true;
 }
 
 
@@ -2217,11 +3166,24 @@ pgstat_get_db_entry(Oid databaseid, bool create)
                HASHCTL         hash_ctl;
 
                result->tables = NULL;
+               result->functions = NULL;
                result->n_xact_commit = 0;
                result->n_xact_rollback = 0;
                result->n_blocks_fetched = 0;
                result->n_blocks_hit = 0;
+               result->n_tuples_returned = 0;
+               result->n_tuples_fetched = 0;
+               result->n_tuples_inserted = 0;
+               result->n_tuples_updated = 0;
+               result->n_tuples_deleted = 0;
                result->last_autovac_time = 0;
+               result->n_conflict_tablespace = 0;
+               result->n_conflict_lock = 0;
+               result->n_conflict_snapshot = 0;
+               result->n_conflict_bufferpin = 0;
+               result->n_conflict_startup_deadlock = 0;
+
+               result->stat_reset_timestamp = GetCurrentTimestamp();
 
                memset(&hash_ctl, 0, sizeof(hash_ctl));
                hash_ctl.keysize = sizeof(Oid);
@@ -2231,6 +3193,63 @@ pgstat_get_db_entry(Oid databaseid, bool create)
                                                                         PGSTAT_TAB_HASH_SIZE,
                                                                         &hash_ctl,
                                                                         HASH_ELEM | HASH_FUNCTION);
+
+               hash_ctl.keysize = sizeof(Oid);
+               hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
+               hash_ctl.hash = oid_hash;
+               result->functions = hash_create("Per-database function",
+                                                                               PGSTAT_FUNCTION_HASH_SIZE,
+                                                                               &hash_ctl,
+                                                                               HASH_ELEM | HASH_FUNCTION);
+       }
+
+       return result;
+}
+
+
+/*
+ * Look up the hash table entry for the specified table. If no hash
+ * table entry exists, create and initialize it if the create parameter
+ * is true; otherwise return NULL.
+ */
+static PgStat_StatTabEntry *
+pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+{
+       PgStat_StatTabEntry *result;
+       bool            found;
+       HASHACTION      action = (create ? HASH_ENTER : HASH_FIND);
+
+       /* Lookup or create the hash table entry for this table */
+       result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
+                                                                                                &tableoid,
+                                                                                                action, &found);
+
+       if (!create && !found)
+               return NULL;
+
+       /* If not found, initialize the new one. */
+       if (!found)
+       {
+               result->numscans = 0;
+               result->tuples_returned = 0;
+               result->tuples_fetched = 0;
+               result->tuples_inserted = 0;
+               result->tuples_updated = 0;
+               result->tuples_deleted = 0;
+               result->tuples_hot_updated = 0;
+               result->n_live_tuples = 0;
+               result->n_dead_tuples = 0;
+               result->changes_since_analyze = 0;
+               result->blocks_fetched = 0;
+               result->blocks_hit = 0;
+               result->vacuum_timestamp = 0;
+               result->vacuum_count = 0;
+               result->autovac_vacuum_timestamp = 0;
+               result->autovac_vacuum_count = 0;
+               result->analyze_timestamp = 0;
+               result->analyze_count = 0;
+               result->autovac_analyze_timestamp = 0;
+               result->autovac_analyze_count = 0;
        }
 
        return result;
@@ -2241,37 +3260,55 @@ pgstat_get_db_entry(Oid databaseid, bool create)
  * pgstat_write_statsfile() -
  *
  *     Tell the news.
+ *     If writing to the permanent file (happens when the collector is
+ *     shutting down only), remove the temporary file so that backends
+ *     starting up under a new postmaster can't read the old data before
+ *     the new collector is ready.
  * ----------
  */
 static void
-pgstat_write_statsfile(void)
+pgstat_write_statsfile(bool permanent)
 {
        HASH_SEQ_STATUS hstat;
        HASH_SEQ_STATUS tstat;
+       HASH_SEQ_STATUS fstat;
        PgStat_StatDBEntry *dbentry;
        PgStat_StatTabEntry *tabentry;
+       PgStat_StatFuncEntry *funcentry;
        FILE       *fpout;
        int32           format_id;
+       const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
+       const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
        /*
         * Open the statistics temp file to write out the current values.
         */
-       fpout = fopen(PGSTAT_STAT_TMPFILE, PG_BINARY_W);
+       fpout = AllocateFile(tmpfile, PG_BINARY_W);
        if (fpout == NULL)
        {
                ereport(LOG,
                                (errcode_for_file_access(),
                                 errmsg("could not open temporary statistics file \"%s\": %m",
-                                               PGSTAT_STAT_TMPFILE)));
+                                               tmpfile)));
                return;
        }
 
+       /*
+        * Set the timestamp of the stats file.
+        */
+       globalStats.stats_timestamp = GetCurrentTimestamp();
+
        /*
         * Write the file header --- currently just a format ID.
         */
        format_id = PGSTAT_FILE_FORMAT_ID;
        fwrite(&format_id, sizeof(format_id), 1, fpout);
 
+       /*
+        * Write global stats struct
+        */
+       fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
        /*
         * Walk through the database table.
         */
@@ -2279,9 +3316,9 @@ pgstat_write_statsfile(void)
        while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
        {
                /*
-                * Write out the DB entry including the number of live backends.
-                * We don't write the tables pointer since it's of no use to any
-                * other process.
+                * Write out the DB entry including the number of live backends. We
+                * don't write the tables or functions pointers, since they're of no
+                * use to any other process.
                 */
                fputc('D', fpout);
                fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
@@ -2296,6 +3333,16 @@ pgstat_write_statsfile(void)
                        fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
                }
 
+               /*
+                * Walk through the database's function stats table.
+                */
+               hash_seq_init(&fstat, dbentry->functions);
+               while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+               {
+                       fputc('F', fpout);
+                       fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
+               }
+
                /*
                 * Mark the end of this DB
                 */
@@ -2313,27 +3360,59 @@ pgstat_write_statsfile(void)
        {
                ereport(LOG,
                                (errcode_for_file_access(),
-                                errmsg("could not write temporary statistics file \"%s\": %m",
-                                               PGSTAT_STAT_TMPFILE)));
-               fclose(fpout);
-               unlink(PGSTAT_STAT_TMPFILE);
+                          errmsg("could not write temporary statistics file \"%s\": %m",
+                                         tmpfile)));
+               FreeFile(fpout);
+               unlink(tmpfile);
        }
-       else if (fclose(fpout) < 0)
+       else if (FreeFile(fpout) < 0)
        {
                ereport(LOG,
                                (errcode_for_file_access(),
                           errmsg("could not close temporary statistics file \"%s\": %m",
-                                         PGSTAT_STAT_TMPFILE)));
-               unlink(PGSTAT_STAT_TMPFILE);
+                                         tmpfile)));
+               unlink(tmpfile);
        }
-       else if (rename(PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME) < 0)
+       else if (rename(tmpfile, statfile) < 0)
        {
                ereport(LOG,
                                (errcode_for_file_access(),
                                 errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
-                                               PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME)));
-               unlink(PGSTAT_STAT_TMPFILE);
+                                               tmpfile, statfile)));
+               unlink(tmpfile);
+       }
+       else
+       {
+               /*
+                * Successful write, so update last_statwrite.
+                */
+               last_statwrite = globalStats.stats_timestamp;
+
+               /*
+                * If there is clock skew between backends and the collector, we could
+                * receive a stats request time that's in the future.  If so, complain
+                * and reset last_statrequest.  Resetting ensures that no inquiry
+                * message can cause more than one stats file write to occur.
+                */
+               if (last_statrequest > last_statwrite)
+               {
+                       char       *reqtime;
+                       char       *mytime;
+
+                       /* Copy because timestamptz_to_str returns a static buffer */
+                       reqtime = pstrdup(timestamptz_to_str(last_statrequest));
+                       mytime = pstrdup(timestamptz_to_str(last_statwrite));
+                       elog(LOG, "last_statrequest %s is later than collector's time %s",
+                                reqtime, mytime);
+                       pfree(reqtime);
+                       pfree(mytime);
+
+                       last_statrequest = last_statwrite;
+               }
        }
+
+       if (permanent)
+               unlink(pgstat_stat_filename);
 }
 
 
@@ -2344,38 +3423,28 @@ pgstat_write_statsfile(void)
  *     databases' hash table (whose entries point to the tables' hash tables).
  * ----------
  */
-static void
-pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
+static HTAB *
+pgstat_read_statsfile(Oid onlydb, bool permanent)
 {
        PgStat_StatDBEntry *dbentry;
        PgStat_StatDBEntry dbbuf;
        PgStat_StatTabEntry *tabentry;
        PgStat_StatTabEntry tabbuf;
+       PgStat_StatFuncEntry funcbuf;
+       PgStat_StatFuncEntry *funcentry;
        HASHCTL         hash_ctl;
+       HTAB       *dbhash;
        HTAB       *tabhash = NULL;
+       HTAB       *funchash = NULL;
        FILE       *fpin;
        int32           format_id;
        bool            found;
-       MemoryContext use_mcxt;
-       int                     mcxt_flags;
+       const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 
        /*
-        * If running in the collector or the autovacuum process, we use the
-        * DynaHashCxt memory context.  If running in a backend, we use the
-        * TopTransactionContext instead, so the caller must only know the last
-        * XactId when this call happened to know if his tables are still valid or
-        * already gone!
+        * The tables will live in pgStatLocalContext.
         */
-       if (pgStatRunningInCollector || IsAutoVacuumProcess())
-       {
-               use_mcxt = NULL;
-               mcxt_flags = 0;
-       }
-       else
-       {
-               use_mcxt = TopTransactionContext;
-               mcxt_flags = HASH_CONTEXT;
-       }
+       pgstat_setup_memcxt();
 
        /*
         * Create the DB hashtable
@@ -2384,17 +3453,40 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
        hash_ctl.keysize = sizeof(Oid);
        hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
        hash_ctl.hash = oid_hash;
-       hash_ctl.hcxt = use_mcxt;
-       *dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-                                                 HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+       hash_ctl.hcxt = pgStatLocalContext;
+       dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
+                                                HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+       /*
+        * Clear out global statistics so they start from zero in case we can't
+        * load an existing statsfile.
+        */
+       memset(&globalStats, 0, sizeof(globalStats));
+
+       /*
+        * Set the current timestamp (will be kept only in case we can't load an
+        * existing statsfile).
+        */
+       globalStats.stat_reset_timestamp = GetCurrentTimestamp();
 
        /*
         * Try to open the status file. If it doesn't exist, the backends simply
         * return zero for anything and the collector simply starts from scratch
         * with empty counters.
+        *
+        * ENOENT is a possibility if the stats collector is not running or has
+        * not yet written the stats file the first time.  Any other failure
+        * condition is suspicious.
         */
-       if ((fpin = AllocateFile(PGSTAT_STAT_FILENAME, PG_BINARY_R)) == NULL)
-               return;
+       if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+       {
+               if (errno != ENOENT)
+                       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not open statistics file \"%s\": %m",
+                                                       statfile)));
+               return dbhash;
+       }
 
        /*
         * Verify it's of the expected format.
@@ -2403,7 +3495,17 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
                || format_id != PGSTAT_FILE_FORMAT_ID)
        {
                ereport(pgStatRunningInCollector ? LOG : WARNING,
-                               (errmsg("corrupted pgstat.stat file")));
+                               (errmsg("corrupted statistics file \"%s\"", statfile)));
+               goto done;
+       }
+
+       /*
+        * Read global stats struct
+        */
+       if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+       {
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted statistics file \"%s\"", statfile)));
                goto done;
        }
 
@@ -2417,34 +3519,37 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
                {
                                /*
                                 * 'D'  A PgStat_StatDBEntry struct describing a database
-                                * follows. Subsequently, zero to many 'T' entries will follow
-                                * until a 'd' is encountered.
+                                * follows. Subsequently, zero to many 'T' and 'F' entries
+                                * will follow until a 'd' is encountered.
                                 */
                        case 'D':
                                if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
                                                  fpin) != offsetof(PgStat_StatDBEntry, tables))
                                {
                                        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
                                        goto done;
                                }
 
                                /*
                                 * Add to the DB hash
                                 */
-                               dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
+                               dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
                                                                                                  (void *) &dbbuf.databaseid,
                                                                                                                         HASH_ENTER,
                                                                                                                         &found);
                                if (found)
                                {
                                        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
                                        goto done;
                                }
 
                                memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
                                dbentry->tables = NULL;
+                               dbentry->functions = NULL;
 
                                /*
                                 * Don't collect tables if not the requested DB (or the
@@ -2461,17 +3566,27 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
                                hash_ctl.keysize = sizeof(Oid);
                                hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
                                hash_ctl.hash = oid_hash;
-                               hash_ctl.hcxt = use_mcxt;
+                               hash_ctl.hcxt = pgStatLocalContext;
                                dbentry->tables = hash_create("Per-database table",
                                                                                          PGSTAT_TAB_HASH_SIZE,
                                                                                          &hash_ctl,
-                                                                        HASH_ELEM | HASH_FUNCTION | mcxt_flags);
+                                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+                               hash_ctl.keysize = sizeof(Oid);
+                               hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
+                               hash_ctl.hash = oid_hash;
+                               hash_ctl.hcxt = pgStatLocalContext;
+                               dbentry->functions = hash_create("Per-database function",
+                                                                                                PGSTAT_FUNCTION_HASH_SIZE,
+                                                                                                &hash_ctl,
+                                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
                                /*
-                                * Arrange that following 'T's add entries to this database's
-                                * tables hash table.
+                                * Arrange that following records add entries to this
+                                * database's hash tables.
                                 */
                                tabhash = dbentry->tables;
+                               funchash = dbentry->functions;
                                break;
 
                                /*
@@ -2479,6 +3594,7 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
                                 */
                        case 'd':
                                tabhash = NULL;
+                               funchash = NULL;
                                break;
 
                                /*
@@ -2489,28 +3605,64 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
                                                  fpin) != sizeof(PgStat_StatTabEntry))
                                {
                                        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
                                        goto done;
                                }
 
                                /*
                                 * Skip if table belongs to a not requested database.
                                 */
-                               if (tabhash == NULL)
+                               if (tabhash == NULL)
+                                       break;
+
+                               tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
+                                                                                                       (void *) &tabbuf.tableid,
+                                                                                                                HASH_ENTER, &found);
+
+                               if (found)
+                               {
+                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
+                                       goto done;
+                               }
+
+                               memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+                               break;
+
+                               /*
+                                * 'F'  A PgStat_StatFuncEntry follows.
+                                */
+                       case 'F':
+                               if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
+                                                 fpin) != sizeof(PgStat_StatFuncEntry))
+                               {
+                                       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
+                                       goto done;
+                               }
+
+                               /*
+                                * Skip if function belongs to a not requested database.
+                                */
+                               if (funchash == NULL)
                                        break;
 
-                               tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-                                                                                                       (void *) &tabbuf.tableid,
+                               funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
+                                                                                               (void *) &funcbuf.functionid,
                                                                                                                 HASH_ENTER, &found);
 
                                if (found)
                                {
                                        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                                       (errmsg("corrupted pgstat.stat file")));
+                                                       (errmsg("corrupted statistics file \"%s\"",
+                                                                       statfile)));
                                        goto done;
                                }
 
-                               memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+                               memcpy(funcentry, &funcbuf, sizeof(funcbuf));
                                break;
 
                                /*
@@ -2521,51 +3673,204 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb)
 
                        default:
                                ereport(pgStatRunningInCollector ? LOG : WARNING,
-                                               (errmsg("corrupted pgstat.stat file")));
+                                               (errmsg("corrupted statistics file \"%s\"",
+                                                               statfile)));
                                goto done;
                }
        }
 
 done:
        FreeFile(fpin);
+
+       if (permanent)
+               unlink(PGSTAT_STAT_PERMANENT_FILENAME);
+
+       return dbhash;
 }
 
-/*
- * If not done for this transaction, read the statistics collector
- * stats file into some hash tables.
- *
- * Because we store the tables in TopTransactionContext, the result
- * is good for the entire current main transaction.
+/* ----------
+ * pgstat_read_statsfile_timestamp() -
  *
- * Inside the autovacuum process, the statfile is assumed to be valid
- * "forever", that is one iteration, within one database.  This means
- * we only consider the statistics as they were when the autovacuum
- * iteration started.
+ *     Attempt to fetch the timestamp of an existing stats file.
+ *     Returns TRUE if successful (timestamp is stored at *ts).
+ * ----------
  */
-static void
-backend_read_statsfile(void)
+static bool
+pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts)
 {
-       if (IsAutoVacuumProcess())
+       PgStat_GlobalStats myGlobalStats;
+       FILE       *fpin;
+       int32           format_id;
+       const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+
+       /*
+        * Try to open the status file.  As above, anything but ENOENT is worthy
+        * of complaining about.
+        */
+       if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+       {
+               if (errno != ENOENT)
+                       ereport(pgStatRunningInCollector ? LOG : WARNING,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not open statistics file \"%s\": %m",
+                                                       statfile)));
+               return false;
+       }
+
+       /*
+        * Verify it's of the expected format.
+        */
+       if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
+               || format_id != PGSTAT_FILE_FORMAT_ID)
+       {
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted statistics file \"%s\"", statfile)));
+               FreeFile(fpin);
+               return false;
+       }
+
+       /*
+        * Read global stats struct
+        */
+       if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), fpin) != sizeof(myGlobalStats))
        {
-               /* already read it? */
-               if (pgStatDBHash)
-                       return;
-               Assert(!pgStatRunningInCollector);
-               pgstat_read_statsfile(&pgStatDBHash, InvalidOid);
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted statistics file \"%s\"", statfile)));
+               FreeFile(fpin);
+               return false;
        }
+
+       *ts = myGlobalStats.stats_timestamp;
+
+       FreeFile(fpin);
+       return true;
+}
+
+/*
+ * If not already done, read the statistics collector stats file into
+ * some hash tables.  The results will be kept until pgstat_clear_snapshot()
+ * is called (typically, at end of transaction).
+ */
+static void
+backend_read_statsfile(void)
+{
+       TimestampTz min_ts;
+       int                     count;
+
+       /* already read it? */
+       if (pgStatDBHash)
+               return;
+       Assert(!pgStatRunningInCollector);
+
+       /*
+        * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec
+        * before now.  This indirectly ensures that the collector needn't write
+        * the file more often than PGSTAT_STAT_INTERVAL.  In an autovacuum
+        * worker, however, we want a lower delay to avoid using stale data, so we
+        * use PGSTAT_RETRY_DELAY (since the number of workers is low, this
+        * shouldn't be a problem).
+        *
+        * Note that we don't recompute min_ts after sleeping; so we might end up
+        * accepting a file a bit older than PGSTAT_STAT_INTERVAL.      In practice
+        * that shouldn't happen, though, as long as the sleep time is less than
+        * PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about
+        * what our cutoff time really is.
+        */
+       if (IsAutoVacuumWorkerProcess())
+               min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
+                                                                                        -PGSTAT_RETRY_DELAY);
        else
+               min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
+                                                                                        -PGSTAT_STAT_INTERVAL);
+
+       /*
+        * Loop until a fresh enough stats file is available or we run out of time.
+        * The stats inquiry message is sent repeatedly in case collector drops
+        * it.
+        */
+       for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
        {
-               TransactionId topXid = GetTopTransactionId();
+               TimestampTz file_ts = 0;
 
-               if (!TransactionIdEquals(pgStatDBHashXact, topXid))
-               {
-                       Assert(!pgStatRunningInCollector);
-                       pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId);
-                       pgStatDBHashXact = topXid;
-               }
+               CHECK_FOR_INTERRUPTS();
+
+               if (pgstat_read_statsfile_timestamp(false, &file_ts) &&
+                       file_ts >= min_ts)
+                       break;
+
+               /* Not there or too old, so kick the collector and wait a bit */
+               pgstat_send_inquiry(min_ts);
+               pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
        }
+
+       if (count >= PGSTAT_POLL_LOOP_COUNT)
+               elog(WARNING, "pgstat wait timeout");
+
+       /* Autovacuum launcher wants stats about all databases */
+       if (IsAutoVacuumLauncherProcess())
+               pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
+       else
+               pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false);
+}
+
+
+/* ----------
+ * pgstat_setup_memcxt() -
+ *
+ *     Create pgStatLocalContext, if not already done.
+ * ----------
+ */
+static void
+pgstat_setup_memcxt(void)
+{
+       if (!pgStatLocalContext)
+               pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
+                                                                                                  "Statistics snapshot",
+                                                                                                  ALLOCSET_SMALL_MINSIZE,
+                                                                                                  ALLOCSET_SMALL_INITSIZE,
+                                                                                                  ALLOCSET_SMALL_MAXSIZE);
+}
+
+
+/* ----------
+ * pgstat_clear_snapshot() -
+ *
+ *     Discard any data collected in the current transaction.  Any subsequent
+ *     request will cause new snapshots to be read.
+ *
+ *     This is also invoked during transaction commit or abort to discard
+ *     the no-longer-wanted snapshot.
+ * ----------
+ */
+void
+pgstat_clear_snapshot(void)
+{
+       /* Release memory, if any was allocated */
+       if (pgStatLocalContext)
+               MemoryContextDelete(pgStatLocalContext);
+
+       /* Reset variables */
+       pgStatLocalContext = NULL;
+       pgStatDBHash = NULL;
+       localBackendStatusTable = NULL;
+       localNumBackends = 0;
 }
 
+
+/* ----------
+ * pgstat_recv_inquiry() -
+ *
+ *     Process stat inquiry requests.
+ * ----------
+ */
+static void
+pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
+{
+       if (msg->inquiry_time > last_statrequest)
+               last_statrequest = msg->inquiry_time;
+}
+
+
 /* ----------
  * pgstat_recv_tabstat() -
  *
@@ -2575,7 +3880,6 @@ backend_read_statsfile(void)
 static void
 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 {
-       PgStat_TableEntry *tabmsg = &(msg->m_entry[0]);
        PgStat_StatDBEntry *dbentry;
        PgStat_StatTabEntry *tabentry;
        int                     i;
@@ -2594,8 +3898,10 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
         */
        for (i = 0; i < msg->m_nentries; i++)
        {
+               PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
+
                tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                                                                 (void *) &(tabmsg[i].t_id),
+                                                                                                       (void *) &(tabmsg->t_id),
                                                                                                           HASH_ENTER, &found);
 
                if (!found)
@@ -2604,50 +3910,62 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                         * If it's a new table entry, initialize counters to the values we
                         * just got.
                         */
-                       tabentry->numscans = tabmsg[i].t_numscans;
-                       tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
-                       tabentry->last_anl_tuples = 0;
+                       tabentry->numscans = tabmsg->t_counts.t_numscans;
+                       tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted;
+                       tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated;
+                       tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
+                       tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
+                       tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples;
+                       tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit;
+
                        tabentry->vacuum_timestamp = 0;
+                       tabentry->vacuum_count = 0;
                        tabentry->autovac_vacuum_timestamp = 0;
+                       tabentry->autovac_vacuum_count = 0;
                        tabentry->analyze_timestamp = 0;
+                       tabentry->analyze_count = 0;
                        tabentry->autovac_analyze_timestamp = 0;
-
-                       tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
+                       tabentry->autovac_analyze_count = 0;
                }
                else
                {
                        /*
                         * Otherwise add the values to the existing entry.
                         */
-                       tabentry->numscans += tabmsg[i].t_numscans;
-                       tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
-
-                       tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+                       tabentry->numscans += tabmsg->t_counts.t_numscans;
+                       tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
+                       tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated;
+                       tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples;
+                       tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples;
+                       tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples;
+                       tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit;
                }
 
+               /* Clamp n_live_tuples in case of negative delta_live_tuples */
+               tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+               /* Likewise for n_dead_tuples */
+               tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+
                /*
-                * And add the block IO to the database entry.
+                * Add per-table stats to the per-database entry, too.
                 */
-               dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-               dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+               dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned;
+               dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched;
+               dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted;
+               dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated;
+               dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
+               dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
+               dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
        }
 }
 
@@ -2708,6 +4026,8 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
        {
                if (dbentry->tables != NULL)
                        hash_destroy(dbentry->tables);
+               if (dbentry->functions != NULL)
+                       hash_destroy(dbentry->functions);
 
                if (hash_search(pgStatDBHash,
                                                (void *) &(dbentry->databaseid),
@@ -2745,12 +4065,28 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
         */
        if (dbentry->tables != NULL)
                hash_destroy(dbentry->tables);
+       if (dbentry->functions != NULL)
+               hash_destroy(dbentry->functions);
 
        dbentry->tables = NULL;
+       dbentry->functions = NULL;
+
+       /*
+        * Reset database-level stats too.      This should match the initialization
+        * code in pgstat_get_db_entry().
+        */
        dbentry->n_xact_commit = 0;
        dbentry->n_xact_rollback = 0;
        dbentry->n_blocks_fetched = 0;
        dbentry->n_blocks_hit = 0;
+       dbentry->n_tuples_returned = 0;
+       dbentry->n_tuples_fetched = 0;
+       dbentry->n_tuples_inserted = 0;
+       dbentry->n_tuples_updated = 0;
+       dbentry->n_tuples_deleted = 0;
+       dbentry->last_autovac_time = 0;
+
+       dbentry->stat_reset_timestamp = GetCurrentTimestamp();
 
        memset(&hash_ctl, 0, sizeof(hash_ctl));
        hash_ctl.keysize = sizeof(Oid);
@@ -2760,33 +4096,82 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
                                                                  PGSTAT_TAB_HASH_SIZE,
                                                                  &hash_ctl,
                                                                  HASH_ELEM | HASH_FUNCTION);
+
+       hash_ctl.keysize = sizeof(Oid);
+       hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
+       hash_ctl.hash = oid_hash;
+       dbentry->functions = hash_create("Per-database function",
+                                                                        PGSTAT_FUNCTION_HASH_SIZE,
+                                                                        &hash_ctl,
+                                                                        HASH_ELEM | HASH_FUNCTION);
 }
 
 /* ----------
- * pgstat_recv_autovac() -
+ * pgstat_recv_resetsharedcounter() -
  *
- *     Process an autovacuum signalling message.
+ *     Reset some shared statistics of the cluster.
  * ----------
  */
 static void
-pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 {
-       PgStat_StatDBEntry *dbentry;
+       if (msg->m_resettarget == RESET_BGWRITER)
+       {
+               /* Reset the global background writer statistics for the cluster. */
+               memset(&globalStats, 0, sizeof(globalStats));
+               globalStats.stat_reset_timestamp = GetCurrentTimestamp();
+       }
 
        /*
-        * Lookup the database in the hashtable.  Don't create the entry if it
-        * doesn't exist, because autovacuum may be processing a template
-        * database.  If this isn't the case, the database is most likely to have
-        * an entry already.  (If it doesn't, not much harm is done anyway --
-        * it'll get created as soon as somebody actually uses the database.)
+        * Presumably the sender of this message validated the target; don't
+        * complain here if it's not valid.
         */
+}
+
+/* ----------
+ * pgstat_recv_resetsinglecounter() -
+ *
+ *     Reset statistics for a single object
+ * ----------
+ */
+static void
+pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+
        dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
+
+       if (!dbentry)
                return;
 
+       /* Set the reset timestamp for the whole database */
+       dbentry->stat_reset_timestamp = GetCurrentTimestamp();
+
+       /* Remove object if it exists, ignore it if not */
+       if (msg->m_resettype == RESET_TABLE)
+               (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
+                                                  HASH_REMOVE, NULL);
+       else if (msg->m_resettype == RESET_FUNCTION)
+               (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
+                                                  HASH_REMOVE, NULL);
+}
+
+/* ----------
+ * pgstat_recv_autovac() -
+ *
+ *     Process an autovacuum signalling message.
+ * ----------
+ */
+static void
+pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+
        /*
-        * Store the last autovacuum time in the database entry.
+        * Store the last autovacuum time in the database's hashtable entry.
         */
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
        dbentry->last_autovac_time = msg->m_start_time;
 }
 
@@ -2803,32 +4188,25 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
        PgStat_StatTabEntry *tabentry;
 
        /*
-        * Don't create either the database or table entry if it doesn't already
-        * exist.  This avoids bloating the stats with entries for stuff that is
-        * only touched by vacuum and not by live operations.
+        * Store the data in the table's hashtable entry.
         */
-       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
-       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-                                                  HASH_FIND, NULL);
-       if (tabentry == NULL)
-               return;
+       tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
 
-       if (msg->m_autovacuum) 
-               tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
-       else 
-               tabentry->vacuum_timestamp = msg->m_vacuumtime; 
        tabentry->n_live_tuples = msg->m_tuples;
+       /* Resetting dead_tuples to 0 is an approximation ... */
        tabentry->n_dead_tuples = 0;
-       if (msg->m_analyze)
+
+       if (msg->m_autovacuum)
        {
-               tabentry->last_anl_tuples = msg->m_tuples;
-               if (msg->m_autovacuum)
-                       tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
-               else
-                       tabentry->analyze_timestamp = msg->m_vacuumtime;
+               tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
+               tabentry->autovac_vacuum_count++;
+       }
+       else
+       {
+               tabentry->vacuum_timestamp = msg->m_vacuumtime;
+               tabentry->vacuum_count++;
        }
 }
 
@@ -2845,24 +4223,169 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
        PgStat_StatTabEntry *tabentry;
 
        /*
-        * Don't create either the database or table entry if it doesn't already
-        * exist.  This avoids bloating the stats with entries for stuff that is
-        * only touched by analyze and not by live operations.
+        * Store the data in the table's hashtable entry.
         */
-       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-       if (dbentry == NULL)
-               return;
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
-       tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
-                                                  HASH_FIND, NULL);
-       if (tabentry == NULL)
-               return;
+       tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
 
-       if (msg->m_autovacuum) 
-               tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
-       else 
-               tabentry->analyze_timestamp = msg->m_analyzetime;
        tabentry->n_live_tuples = msg->m_live_tuples;
        tabentry->n_dead_tuples = msg->m_dead_tuples;
-       tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
+
+       /*
+        * We reset changes_since_analyze to zero, forgetting any changes that
+        * occurred while the ANALYZE was in progress.
+        */
+       tabentry->changes_since_analyze = 0;
+
+       if (msg->m_autovacuum)
+       {
+               tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
+               tabentry->autovac_analyze_count++;
+       }
+       else
+       {
+               tabentry->analyze_timestamp = msg->m_analyzetime;
+               tabentry->analyze_count++;
+       }
+}
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ *     Process a BGWRITER message: add each counter in msg to globalStats.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+       globalStats.timed_checkpoints += msg->m_timed_checkpoints;
+       globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+       globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+       globalStats.buf_written_clean += msg->m_buf_written_clean;
+       globalStats.maxwritten_clean += msg->m_maxwritten_clean;
+       globalStats.buf_written_backend += msg->m_buf_written_backend;
+       globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
+       globalStats.buf_alloc += msg->m_buf_alloc;
+}
+
+/* ----------
+ * pgstat_recv_recoveryconflict() -
+ *
+ *     Process a RECOVERYCONFLICT message: count it under msg->m_reason.
+ * ----------
+ */
+static void
+pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
+       switch (msg->m_reason)
+       {
+               case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+
+                       /*
+                        * Since we drop the information about the database as soon as it
+                        * replicates, there is no point in counting these conflicts.
+                        */
+                       break;
+               case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+                       dbentry->n_conflict_tablespace++;
+                       break;
+               case PROCSIG_RECOVERY_CONFLICT_LOCK:
+                       dbentry->n_conflict_lock++;
+                       break;
+               case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+                       dbentry->n_conflict_snapshot++;
+                       break;
+               case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+                       dbentry->n_conflict_bufferpin++;
+                       break;
+               case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+                       dbentry->n_conflict_startup_deadlock++;
+                       break;
+       }
+}
+
+/* ----------
+ * pgstat_recv_funcstat() -
+ *
+ *     Fold per-function call counts and times from msg into the db entry.
+ * ----------
+ */
+static void
+pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
+{
+       PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
+       PgStat_StatDBEntry *dbentry;
+       PgStat_StatFuncEntry *funcentry;
+       int                     i;
+       bool            found;
+
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
+
+       /*
+        * Process all function entries in the message.
+        */
+       for (i = 0; i < msg->m_nentries; i++, funcmsg++)
+       {
+               funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
+                                                                                                  (void *) &(funcmsg->f_id),
+                                                                                                                HASH_ENTER, &found);
+
+               if (!found)
+               {
+                       /*
+                        * If it's a new function entry, initialize counters to the values
+                        * we just got.
+                        */
+                       funcentry->f_numcalls = funcmsg->f_numcalls;
+                       funcentry->f_time = funcmsg->f_time;
+                       funcentry->f_time_self = funcmsg->f_time_self;
+               }
+               else
+               {
+                       /*
+                        * Otherwise add the values to the existing entry.
+                        */
+                       funcentry->f_numcalls += funcmsg->f_numcalls;
+                       funcentry->f_time += funcmsg->f_time;
+                       funcentry->f_time_self += funcmsg->f_time_self;
+               }
+       }
+}
+
+/* ----------
+ * pgstat_recv_funcpurge() -
+ *
+ *     Remove the function ids listed in msg from the db's function hashtable.
+ * ----------
+ */
+static void
+pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
+{
+       PgStat_StatDBEntry *dbentry;
+       int                     i;
+
+       dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
+
+       /*
+        * No need to purge if we don't even know the database.
+        */
+       if (!dbentry || !dbentry->functions)
+               return;
+
+       /*
+        * Process all function entries in the message.
+        */
+       for (i = 0; i < msg->m_nentries; i++)
+       {
+               /* Remove from hashtable if present; we don't care if it's not. */
+               (void) hash_search(dbentry->functions,
+                                                  (void *) &(msg->m_functionid[i]),
+                                                  HASH_REMOVE, NULL);
+       }
 }