]> granicus.if.org Git - postgresql/blobdiff - src/backend/postmaster/pgstat.c
Fix up pgstats counting of live and dead tuples to recognize that committed
[postgresql] / src / backend / postmaster / pgstat.c
index d6e8cd220a0e720183559a32ce13e1417775941f..b41a16de44ce86435068597a40e0fa3537ccd08b 100644 (file)
@@ -13,7 +13,7 @@
  *
  *     Copyright (c) 2001-2007, PostgreSQL Global Development Group
  *
- *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.147 2007/02/15 23:23:23 alvherre Exp $
+ *     $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
  * ----------
  */
 #include "postgres.h"
@@ -39,6 +39,7 @@
 
 #include "access/heapam.h"
 #include "access/transam.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "catalog/pg_database.h"
 #include "libpq/ip.h"
@@ -98,6 +99,13 @@ bool         pgstat_collect_tuplelevel = false;
 bool           pgstat_collect_blocklevel = false;
 bool           pgstat_collect_querystring = false;
 
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around.  We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
+
 /* ----------
  * Local data
  * ----------
@@ -111,30 +119,70 @@ static time_t last_pgstat_start_time;
 static bool pgStatRunningInCollector = false;
 
 /*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
  */
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM                100                     /* we alloc this many at a time */
+
+typedef struct TabStatusArray
 {
-       int                     tsa_alloc;              /* num allocated */
-       int                     tsa_used;               /* num actually used */
-       PgStat_MsgTabstat **tsa_messages;       /* the array itself */
-} TabStatArray;
+       struct TabStatusArray *tsa_next;        /* link to next array, if any */
+       int                     tsa_used;                               /* # entries currently used */
+       PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
+} TabStatusArray;
 
-#define TABSTAT_QUANTUM                4       /* we alloc this many at a time */
+static TabStatusArray *pgStatTabList = NULL;
 
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort.  Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext.  This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+       int                     nest_level;                             /* subtransaction nest level */
+       struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
+       PgStat_TableXactStatus *first;          /* head of list for this subxact */
+} PgStat_SubXactStatus;
+
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
 
 static int     pgStatXactCommit = 0;
 static int     pgStatXactRollback = 0;
 
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+       PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+       PgStat_Counter tuples_deleted;  /* tuples deleted in xact */
+       Oid                     t_id;                           /* table's OID */
+       bool            t_shared;                       /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
 static MemoryContext pgStatLocalContext = NULL;
 static HTAB *pgStatDBHash = NULL;
 static PgBackendStatus *localBackendStatusTable = NULL;
 static int     localNumBackends = 0;
 
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
+
 static volatile bool need_exit = false;
 static volatile bool need_statwrite = false;
 
@@ -157,8 +205,12 @@ static void pgstat_write_statsfile(void);
 static HTAB *pgstat_read_statsfile(Oid onlydb);
 static void backend_read_statsfile(void);
 static void pgstat_read_current_status(void);
+
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
 static HTAB *pgstat_collect_oids(Oid catalogid);
 
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
 static void pgstat_setup_memcxt(void);
 
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
@@ -171,6 +223,7 @@ static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
 
 
 /* ------------------------------------------------------------
@@ -572,6 +625,10 @@ pgstat_start(void)
        return 0;
 }
 
+void allow_immediate_pgstat_restart(void)
+{
+               last_pgstat_start_time = 0;
+}
 
 /* ------------------------------------------------------------
  * Public functions used by backends follow
@@ -582,70 +639,136 @@ pgstat_start(void)
 /* ----------
  * pgstat_report_tabstat() -
  *
- *     Called from tcop/postgres.c to send the so far collected
- *     per table access statistics to the collector.
+ *     Called from tcop/postgres.c to send the so far collected per-table
+ *     access statistics to the collector.  Note that this is called only
+ *     when not within a transaction, so it is fair to use transaction stop
+ *     time as an approximation of current time.
  * ----------
  */
 void
-pgstat_report_tabstat(void)
+pgstat_report_tabstat(bool force)
 {
+       /* we assume this inits to all zeroes: */
+       static const PgStat_TableCounts all_zeroes;
+       static TimestampTz last_report = 0;     
+
+       TimestampTz now;
+       PgStat_MsgTabstat regular_msg;
+       PgStat_MsgTabstat shared_msg;
+       TabStatusArray *tsa;
        int                     i;
 
-       if (pgStatSock < 0 ||
-               (!pgstat_collect_tuplelevel &&
-                !pgstat_collect_blocklevel))
-       {
-               /* Not reporting stats, so just flush whatever we have */
-               RegularTabStat.tsa_used = 0;
-               SharedTabStat.tsa_used = 0;
+       /* Don't expend a clock check if nothing to do */
+       if (pgStatTabList == NULL ||
+               pgStatTabList->tsa_used == 0)
                return;
-       }
 
        /*
-        * For each message buffer used during the last query set the header
-        * fields and send it out.
+        * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+        * msec since we last sent one, or the caller wants to force stats out.
+        */
+       now = GetCurrentTransactionStopTimestamp();
+       if (!force &&
+               !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+               return;
+       last_report = now;
+
+       /*
+        * Scan through the TabStatusArray struct(s) to find tables that actually
+        * have counts, and build messages to send.  We have to separate shared
+        * relations from regular ones because the databaseid field in the
+        * message header has to depend on that.
         */
-       for (i = 0; i < RegularTabStat.tsa_used; i++)
+       regular_msg.m_databaseid = MyDatabaseId;
+       shared_msg.m_databaseid = InvalidOid;
+       regular_msg.m_nentries = 0;
+       shared_msg.m_nentries = 0;
+
+       for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
        {
-               PgStat_MsgTabstat *tsmsg = RegularTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
+               for (i = 0; i < tsa->tsa_used; i++)
+               {
+                       PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+                       PgStat_MsgTabstat *this_msg;
+                       PgStat_TableEntry *this_ent;
+
+                       /* Shouldn't have any pending transaction-dependent counts */
+                       Assert(entry->trans == NULL);
+
+                       /*
+                        * Ignore entries that didn't accumulate any actual counts,
+                        * such as indexes that were opened by the planner but not used.
+                        */
+                       if (memcmp(&entry->t_counts, &all_zeroes,
+                                          sizeof(PgStat_TableCounts)) == 0)
+                               continue;
+                       /*
+                        * OK, insert data into the appropriate message, and send if full.
+                        */
+                       this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+                       this_ent = &this_msg->m_entry[this_msg->m_nentries];
+                       this_ent->t_id = entry->t_id;
+                       memcpy(&this_ent->t_counts, &entry->t_counts,
+                                  sizeof(PgStat_TableCounts));
+                       if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+                       {
+                               pgstat_send_tabstat(this_msg);
+                               this_msg->m_nentries = 0;
+                       }
+               }
+               /* zero out TableStatus structs after use */
+               MemSet(tsa->tsa_entries, 0,
+                          tsa->tsa_used * sizeof(PgStat_TableStatus));
+               tsa->tsa_used = 0;
+       }
 
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
+       /*
+        * Send partial messages.  If force is true, make sure that any pending
+        * xact commit/abort gets counted, even if no table stats to send.
+        */
+       if (regular_msg.m_nentries > 0 ||
+               (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+               pgstat_send_tabstat(&regular_msg);
+       if (shared_msg.m_nentries > 0)
+               pgstat_send_tabstat(&shared_msg);
+}
 
+/*
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
+ */
+static void
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+{
+       int                     n;
+       int                     len;
+
+       /* It's unlikely we'd get here with no socket, but maybe not impossible */
+       if (pgStatSock < 0)
+               return;
+
+       /*
+        * Report accumulated xact commit/rollback whenever we send a normal
+        * tabstat message
+        */
+       if (OidIsValid(tsmsg->m_databaseid))
+       {
                tsmsg->m_xact_commit = pgStatXactCommit;
                tsmsg->m_xact_rollback = pgStatXactRollback;
                pgStatXactCommit = 0;
                pgStatXactRollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = MyDatabaseId;
-               pgstat_send(tsmsg, len);
        }
-       RegularTabStat.tsa_used = 0;
-
-       /* Ditto, for shared relations */
-       for (i = 0; i < SharedTabStat.tsa_used; i++)
+       else
        {
-               PgStat_MsgTabstat *tsmsg = SharedTabStat.tsa_messages[i];
-               int                     n;
-               int                     len;
-
-               n = tsmsg->m_nentries;
-               len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
-                       n * sizeof(PgStat_TableEntry);
-
-               /* We don't report transaction commit/abort here */
                tsmsg->m_xact_commit = 0;
                tsmsg->m_xact_rollback = 0;
-
-               pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
-               tsmsg->m_databaseid = InvalidOid;
-               pgstat_send(tsmsg, len);
        }
-       SharedTabStat.tsa_used = 0;
+
+       n = tsmsg->m_nentries;
+       len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+               n * sizeof(PgStat_TableEntry);
+
+       pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+       pgstat_send(tsmsg, len);
 }
 
 
@@ -981,186 +1104,489 @@ pgstat_ping(void)
        pgstat_send(&msg, sizeof(msg));
 }
 
-/*
- * Enlarge a TabStatArray
- */
-static void
-more_tabstat_space(TabStatArray *tsarr)
-{
-       PgStat_MsgTabstat *newMessages;
-       PgStat_MsgTabstat **msgArray;
-       int                     newAlloc;
-       int                     i;
-
-       AssertArg(PointerIsValid(tsarr));
-
-       newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
-
-       /* Create (another) quantum of message buffers */
-       newMessages = (PgStat_MsgTabstat *)
-               MemoryContextAllocZero(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
-
-       /* Create or enlarge the pointer array */
-       if (tsarr->tsa_messages == NULL)
-               msgArray = (PgStat_MsgTabstat **)
-                       MemoryContextAlloc(TopMemoryContext,
-                                                          sizeof(PgStat_MsgTabstat *) * newAlloc);
-       else
-               msgArray = (PgStat_MsgTabstat **)
-                       repalloc(tsarr->tsa_messages,
-                                        sizeof(PgStat_MsgTabstat *) * newAlloc);
-
-       for (i = 0; i < TABSTAT_QUANTUM; i++)
-               msgArray[tsarr->tsa_alloc + i] = newMessages++;
-       tsarr->tsa_messages = msgArray;
-       tsarr->tsa_alloc = newAlloc;
-
-       Assert(tsarr->tsa_used < tsarr->tsa_alloc);
-}
 
 /* ----------
  * pgstat_initstats() -
  *
- *     Called from various places usually dealing with initialization
- *     of Relation or Scan structures. The data placed into these
- *     structures from here tell where later to count for buffer reads,
- *     scans and tuples fetched.
+ *     Initialize a relcache entry to count access statistics.
+ *     Called whenever a relation is opened.
+ *
+ *     We assume that a relcache entry's pgstat_info field is zeroed by
+ *     relcache.c when the relcache entry is made; thereafter it is long-lived
+ *     data.  We can avoid repeated searches of the TabStatus arrays when the
+ *     same relation is touched repeatedly within a transaction.
  * ----------
  */
 void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
 {
        Oid                     rel_id = rel->rd_id;
-       PgStat_TableEntry *useent;
-       TabStatArray *tsarr;
-       PgStat_MsgTabstat *tsmsg;
-       int                     mb;
-       int                     i;
+       char            relkind = rel->rd_rel->relkind;
 
-       /*
-        * Initialize data not to count at all.
-        */
-       stats->tabentry = NULL;
+       /* We only count stats for things that have storage */
+       if (!(relkind == RELKIND_RELATION ||
+                 relkind == RELKIND_INDEX ||
+                 relkind == RELKIND_TOASTVALUE))
+       {
+               rel->pgstat_info = NULL;
+               return;
+       }
 
        if (pgStatSock < 0 ||
                !(pgstat_collect_tuplelevel ||
                  pgstat_collect_blocklevel))
+       {
+               /* We're not counting at all */
+               rel->pgstat_info = NULL;
                return;
+       }
 
-       tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
+       /*
+        * If we already set up this relation in the current transaction,
+        * nothing to do.
+        */
+       if (rel->pgstat_info != NULL &&
+               rel->pgstat_info->t_id == rel_id)
+               return;
+
+       /* Else find or make the PgStat_TableStatus entry, and update link */
+       rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+       PgStat_TableStatus *entry;
+       TabStatusArray *tsa;
+       TabStatusArray *prev_tsa;
+       int                     i;
 
        /*
-        * Search the already-used message slots for this relation.
+        * Search the already-used tabstat slots for this relation.
         */
-       for (mb = 0; mb < tsarr->tsa_used; mb++)
+       prev_tsa = NULL;
+       for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
        {
-               tsmsg = tsarr->tsa_messages[mb];
-
-               for (i = tsmsg->m_nentries; --i >= 0;)
+               for (i = 0; i < tsa->tsa_used; i++)
                {
-                       if (tsmsg->m_entry[i].t_id == rel_id)
-                       {
-                               stats->tabentry = (void *) &(tsmsg->m_entry[i]);
-                               return;
-                       }
+                       entry = &tsa->tsa_entries[i];
+                       if (entry->t_id == rel_id)
+                               return entry;
                }
 
-               if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
-                       continue;
-
-               /*
-                * Not found, but found a message buffer with an empty slot instead.
-                * Fine, let's use this one.
-                */
-               i = tsmsg->m_nentries++;
-               useent = &tsmsg->m_entry[i];
-               MemSet(useent, 0, sizeof(PgStat_TableEntry));
-               useent->t_id = rel_id;
-               stats->tabentry = (void *) useent;
-               return;
+               if (tsa->tsa_used < TABSTAT_QUANTUM)
+               {
+                       /*
+                        * It must not be present, but we found a free slot instead.
+                        * Fine, let's use this one.  We assume the entry was already
+                        * zeroed, either at creation or after last use.
+                        */
+                       entry = &tsa->tsa_entries[tsa->tsa_used++];
+                       entry->t_id = rel_id;
+                       entry->t_shared = isshared;
+                       return entry;
+               }
        }
 
        /*
-        * If we ran out of message buffers, we just allocate more.
+        * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
         */
-       if (tsarr->tsa_used >= tsarr->tsa_alloc)
-               more_tabstat_space(tsarr);
+       tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+                                                                                                       sizeof(TabStatusArray));
+       if (prev_tsa)
+               prev_tsa->tsa_next = tsa;
+       else
+               pgStatTabList = tsa;
+
+       /*
+        * Use the first entry of the new TabStatusArray.
+        */
+       entry = &tsa->tsa_entries[tsa->tsa_used++];
+       entry->t_id = rel_id;
+       entry->t_shared = isshared;
+       return entry;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state == NULL || xact_state->nest_level != nest_level)
+       {
+               xact_state = (PgStat_SubXactStatus *)
+                       MemoryContextAlloc(TopTransactionContext,
+                                                          sizeof(PgStat_SubXactStatus));
+               xact_state->nest_level = nest_level;
+               xact_state->prev = pgStatXactStack;
+               xact_state->first = NULL;
+               pgStatXactStack = xact_state;
+       }
+       return xact_state;
+}
+
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+       PgStat_SubXactStatus *xact_state;
+       PgStat_TableXactStatus *trans;
 
        /*
-        * Use the first entry of the next message buffer.
+        * If this is the first rel to be modified at the current nest level,
+        * we first have to push a transaction stack entry.
         */
-       mb = tsarr->tsa_used++;
-       tsmsg = tsarr->tsa_messages[mb];
-       tsmsg->m_nentries = 1;
-       useent = &tsmsg->m_entry[0];
-       MemSet(useent, 0, sizeof(PgStat_TableEntry));
-       useent->t_id = rel_id;
-       stats->tabentry = (void *) useent;
+       xact_state = get_tabstat_stack_level(nest_level);
+
+       /* Now make a per-table stack entry */
+       trans = (PgStat_TableXactStatus *)
+               MemoryContextAllocZero(TopTransactionContext,
+                                                          sizeof(PgStat_TableXactStatus));
+       trans->nest_level = nest_level;
+       trans->upper = pgstat_info->trans;
+       trans->parent = pgstat_info;
+       trans->next = xact_state->first;
+       xact_state->first = trans;
+       pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_inserted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_inserted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_inserted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update
+ */
+void
+pgstat_count_heap_update(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_updated is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_updated++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               /* An UPDATE both inserts a new tuple and deletes the old */
+               pgstat_info->trans->tuples_inserted++;
+               pgstat_info->trans->tuples_deleted++;
+       }
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+       {
+               int             nest_level = GetCurrentTransactionNestLevel();
+
+               /* t_tuples_deleted is nontransactional, so just advance it */
+               pgstat_info->t_counts.t_tuples_deleted++;
+
+               /* We have to log the transactional effect at the proper level */
+               if (pgstat_info->trans == NULL ||
+                       pgstat_info->trans->nest_level != nest_level)
+                       add_tabstat_xact_level(pgstat_info, nest_level);
+
+               pgstat_info->trans->tuples_deleted++;
+       }
 }
 
 
 /* ----------
- * pgstat_count_xact_commit() -
+ * AtEOXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction commits.
+ *     Called from access/transam/xact.c at top-level transaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_commit(void)
+AtEOXact_PgStat(bool isCommit)
 {
-       if (!pgstat_collect_tuplelevel &&
-               !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactCommit++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Count transaction commit or abort.  (We use counters, not just bools,
+        * in case the reporting message isn't sent right away.)
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       if (isCommit)
+               pgStatXactCommit++;
+       else
+               pgStatXactRollback++;
 
-       if (RegularTabStat.tsa_used == 0)
+       /*
+        * Transfer transactional insert/update counts into the base tabstat
+        * entries.  We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+                       }
+                       else
+                       {
+                               /* inserted tuples are dead, deleted tuples are unaffected */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                       }
+                       tabstat->trans = NULL;
+               }
        }
-}
+       pgStatXactStack = NULL;
 
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
 
 /* ----------
- * pgstat_count_xact_rollback() -
+ * AtEOSubXact_PgStat
  *
- *     Called from access/transam/xact.c to count transaction rollbacks.
+ *     Called from access/transam/xact.c at subtransaction commit/abort.
  * ----------
  */
 void
-pgstat_count_xact_rollback(void)
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
 {
-       if (!pgstat_collect_tuplelevel &&
-               !pgstat_collect_blocklevel)
-               return;
-
-       pgStatXactRollback++;
+       PgStat_SubXactStatus *xact_state;
 
        /*
-        * If there was no relation activity yet, just make one existing message
-        * buffer used without slots, causing the next report to tell new
-        * xact-counters.
+        * Transfer transactional insert/update counts into the next higher
+        * subtransaction state.
         */
-       if (RegularTabStat.tsa_alloc == 0)
-               more_tabstat_space(&RegularTabStat);
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL &&
+               xact_state->nest_level >= nestDepth)
+       {
+               PgStat_TableXactStatus *trans;
+               PgStat_TableXactStatus *next_trans;
+
+               /* delink xact_state from stack immediately to simplify reuse case */
+               pgStatXactStack = xact_state->prev;
+
+               for (trans = xact_state->first; trans != NULL; trans = next_trans)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       next_trans = trans->next;
+                       Assert(trans->nest_level == nestDepth);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+                       if (isCommit)
+                       {
+                               if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+                               {
+                                       trans->upper->tuples_inserted += trans->tuples_inserted;
+                                       trans->upper->tuples_deleted += trans->tuples_deleted;
+                                       tabstat->trans = trans->upper;
+                                       pfree(trans);
+                               }
+                               else
+                               {
+                                       /*
+                                        * When there isn't an immediate parent state, we can
+                                        * just reuse the record instead of going through a
+                                        * palloc/pfree pushup (this works since it's all in
+                                        * TopTransactionContext anyway).  We have to re-link
+                                        * it into the parent level, though, and that might mean
+                                        * pushing a new entry into the pgStatXactStack.
+                                        */
+                                       PgStat_SubXactStatus *upper_xact_state;
+
+                                       upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+                                       trans->next = upper_xact_state->first;
+                                       upper_xact_state->first = trans;
+                                       trans->nest_level = nestDepth - 1;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * On abort, inserted tuples are dead (and can be bounced out
+                                * to the top-level tabstat), deleted tuples are unaffected
+                                */
+                               tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+                               tabstat->trans = trans->upper;
+                               pfree(trans);
+                       }
+               }
+               pfree(xact_state);
+       }
+}
+
 
-       if (RegularTabStat.tsa_used == 0)
+/*
+ * AtPrepare_PgStat
+ *             Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
        {
-               RegularTabStat.tsa_used++;
-               RegularTabStat.tsa_messages[0]->m_nentries = 0;
+               PgStat_TableXactStatus *trans;
+
+               Assert(xact_state->nest_level == 1);
+               Assert(xact_state->prev == NULL);
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+                       TwoPhasePgStatRecord record;
+
+                       Assert(trans->nest_level == 1);
+                       Assert(trans->upper == NULL);
+                       tabstat = trans->parent;
+                       Assert(tabstat->trans == trans);
+
+                       record.tuples_inserted = trans->tuples_inserted;
+                       record.tuples_deleted = trans->tuples_deleted;
+                       record.t_id = tabstat->t_id;
+                       record.t_shared = tabstat->t_shared;
+
+                       RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+                                                                  &record, sizeof(TwoPhasePgStatRecord));
+               }
        }
 }
 
+/*
+ * PostPrepare_PgStat
+ *             Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state.  The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+       PgStat_SubXactStatus *xact_state;
+
+       /*
+        * We don't bother to free any of the transactional state,
+        * since it's all in TopTransactionContext and will go away anyway.
+        */
+       xact_state = pgStatXactStack;
+       if (xact_state != NULL)
+       {
+               PgStat_TableXactStatus *trans;
+
+               for (trans = xact_state->first; trans != NULL; trans = trans->next)
+               {
+                       PgStat_TableStatus *tabstat;
+
+                       tabstat = trans->parent;
+                       tabstat->trans = NULL;
+               }
+       }
+       pgStatXactStack = NULL;
+
+       /* Make sure any stats snapshot is thrown away */
+       pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+                                                  void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+                                                 void *recdata, uint32 len)
+{
+       TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+       PgStat_TableStatus *pgstat_info;
+
+       /* Find or create a tabstat entry for the rel */
+       pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+       /* inserted tuples are dead, deleted tuples are no-ops */
+       pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
+}
+
 
 /* ----------
  * pgstat_fetch_stat_dbentry() -
@@ -1284,6 +1710,22 @@ pgstat_fetch_stat_numbackends(void)
        return localNumBackends;
 }
 
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ *  Support function for the SQL-callable pgstat* functions. Returns
+ *  a pointer to the global statistics struct.
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
+{
+       backend_read_statsfile();
+
+       return &globalStats;
+}
+
 
 /* ------------------------------------------------------------
  * Functions for management of the shared-memory PgBackendStatus array
@@ -1418,7 +1860,7 @@ pgstat_beshutdown_hook(int code, Datum arg)
 {
        volatile PgBackendStatus *beentry = MyBEEntry;
 
-       pgstat_report_tabstat();
+       pgstat_report_tabstat(true);
 
        /*
         * Clear my status entry, following the protocol of bumping st_changecount
@@ -1642,6 +2084,38 @@ pgstat_send(void *msg, int len)
 #endif
 }
 
+/* ----------
+ * pgstat_send_bgwriter() -
+ *
+ *      Send bgwriter statistics to the collector
+ * ----------
+ */
+void
+pgstat_send_bgwriter(void)
+{
+       /* We assume this initializes to zeroes */
+       static const PgStat_MsgBgWriter all_zeroes;
+
+       /*
+        * This function can be called even if nothing at all has happened.
+        * In this case, avoid sending a completely empty message to
+        * the stats collector.
+        */
+       if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+               return;
+
+       /*
+        * Prepare and send the message
+        */
+       pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+       pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+
+       /*
+        * Clear out the statistics buffer, so it can be re-used.
+        */
+       MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
+}
+
 
 /* ----------
  * PgstatCollectorMain() -
@@ -1716,7 +2190,7 @@ PgstatCollectorMain(int argc, char *argv[])
        /* Preset the delay between status file writes */
        MemSet(&write_timeout, 0, sizeof(struct itimerval));
        write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
-       write_timeout.it_value.tv_usec = PGSTAT_STAT_INTERVAL % 1000;
+       write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
 
        /*
         * Read in an existing statistics stats file or initialize the stats to
@@ -1888,6 +2362,10 @@ PgstatCollectorMain(int argc, char *argv[])
                                        pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
                                        break;
 
+                               case PGSTAT_MTYPE_BGWRITER:
+                                       pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+                                       break;
+
                                default:
                                        break;
                        }
@@ -1970,6 +2448,11 @@ pgstat_get_db_entry(Oid databaseid, bool create)
                result->n_xact_rollback = 0;
                result->n_blocks_fetched = 0;
                result->n_blocks_hit = 0;
+               result->n_tuples_returned = 0;
+               result->n_tuples_fetched = 0;
+               result->n_tuples_inserted = 0;
+               result->n_tuples_updated = 0;
+               result->n_tuples_deleted = 0;
                result->last_autovac_time = 0;
 
                memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -2021,6 +2504,11 @@ pgstat_write_statsfile(void)
        format_id = PGSTAT_FILE_FORMAT_ID;
        fwrite(&format_id, sizeof(format_id), 1, fpout);
 
+       /*
+        * Write global stats struct
+        */
+       fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
        /*
         * Walk through the database table.
         */
@@ -2123,6 +2611,12 @@ pgstat_read_statsfile(Oid onlydb)
        dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
                                                 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
+       /*
+        * Clear out global statistics so they start from zero in case we can't
+        * load an existing statsfile.
+        */
+       memset(&globalStats, 0, sizeof(globalStats));
+
        /*
         * Try to open the status file. If it doesn't exist, the backends simply
         * return zero for anything and the collector simply starts from scratch
@@ -2142,6 +2636,16 @@ pgstat_read_statsfile(Oid onlydb)
                goto done;
        }
 
+       /*
+        * Read global stats struct
+        */
+       if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+       {
+               ereport(pgStatRunningInCollector ? LOG : WARNING,
+                               (errmsg("corrupted pgstat.stat file")));
+               goto done;
+       }
+
        /*
         * We found an existing collector stats file. Read it and put all the
         * hashtable entries into place.
@@ -2319,10 +2823,6 @@ pgstat_setup_memcxt(void)
 void
 pgstat_clear_snapshot(void)
 {
-       /* In an autovacuum worker process we keep the stats forever */
-       if (IsAutoVacuumWorkerProcess())
-               return;
-
        /* Release memory, if any was allocated */
        if (pgStatLocalContext)
                MemoryContextDelete(pgStatLocalContext);
@@ -2373,51 +2873,50 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                         * If it's a new table entry, initialize counters to the values we
                         * just got.
                         */
-                       tabentry->numscans = tabmsg[i].t_numscans;
-                       tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
-                       tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
+                       tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
+
                        tabentry->last_anl_tuples = 0;
                        tabentry->vacuum_timestamp = 0;
                        tabentry->autovac_vacuum_timestamp = 0;
                        tabentry->analyze_timestamp = 0;
                        tabentry->autovac_analyze_timestamp = 0;
-
-                       tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
                }
                else
                {
                        /*
                         * Otherwise add the values to the existing entry.
                         */
-                       tabentry->numscans += tabmsg[i].t_numscans;
-                       tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
-                       tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
-                       tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
-                       tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
-                       tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
-                       tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted -
-                               tabmsg[i].t_tuples_deleted;
-                       tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
-                               tabmsg[i].t_tuples_deleted;
-
-                       tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
-                       tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+                       tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+                       tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+                       tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+                       tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+                       tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+                       tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+                       tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+                       tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+                       tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
                }
 
                /*
-                * And add the block IO to the database entry.
+                * Add per-table stats to the per-database entry, too.
                 */
-               dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
-               dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+               dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+               dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+               dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+               dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+               dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+               dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+               dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
        }
 }
 
@@ -2642,3 +3141,22 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
        tabentry->n_dead_tuples = msg->m_dead_tuples;
        tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
 }
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ *     Process a BGWRITER message.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+       globalStats.timed_checkpoints += msg->m_timed_checkpoints;
+       globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+       globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+       globalStats.buf_written_lru += msg->m_buf_written_lru;
+       globalStats.buf_written_all += msg->m_buf_written_all;
+       globalStats.maxwritten_lru += msg->m_maxwritten_lru;
+       globalStats.maxwritten_all += msg->m_maxwritten_all;
+}