*
* Copyright (c) 2001-2007, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.149 2007/03/16 17:57:36 mha Exp $
+ * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $
* ----------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/transam.h"
+#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/pg_database.h"
#include "libpq/ip.h"
bool pgstat_collect_blocklevel = false;
bool pgstat_collect_querystring = false;
+/*
+ * BgWriter global statistics counters (unused in other processes).
+ * Stored directly in a stats message structure so it can be sent
+ * without needing to copy things around. We assume this inits to zeroes.
+ */
+PgStat_MsgBgWriter BgWriterStats;
+
/* ----------
* Local data
* ----------
static bool pgStatRunningInCollector = false;
/*
- * Place where backends store per-table info to be sent to the collector.
- * We store shared relations separately from non-shared ones, to be able to
- * send them in separate messages.
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend. Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_initstats() when a relation is
+ * repeatedly opened during a transaction.
*/
-typedef struct TabStatArray
+#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */
+
+typedef struct TabStatusArray
{
- int tsa_alloc; /* num allocated */
- int tsa_used; /* num actually used */
- PgStat_MsgTabstat **tsa_messages; /* the array itself */
-} TabStatArray;
+ struct TabStatusArray *tsa_next; /* link to next array, if any */
+ int tsa_used; /* # entries currently used */
+ PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */
+} TabStatusArray;
-#define TABSTAT_QUANTUM 4 /* we alloc this many at a time */
+static TabStatusArray *pgStatTabList = NULL;
-static TabStatArray RegularTabStat = {0, 0, NULL};
-static TabStatArray SharedTabStat = {0, 0, NULL};
+/*
+ * Tuple insertion/deletion counts for an open transaction can't be propagated
+ * into PgStat_TableStatus counters until we know if it is going to commit
+ * or abort. Hence, we keep these counts in per-subxact structs that live
+ * in TopTransactionContext. This data structure is designed on the assumption
+ * that subxacts won't usually modify very many tables.
+ */
+typedef struct PgStat_SubXactStatus
+{
+ int nest_level; /* subtransaction nest level */
+ struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */
+ PgStat_TableXactStatus *first; /* head of list for this subxact */
+} PgStat_SubXactStatus;
+
+static PgStat_SubXactStatus *pgStatXactStack = NULL;
static int pgStatXactCommit = 0;
static int pgStatXactRollback = 0;
+/* Record that's written to 2PC state file when pgstat state is persisted */
+typedef struct TwoPhasePgStatRecord
+{
+ PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+ PgStat_Counter tuples_deleted; /* tuples deleted in xact */
+ Oid t_id; /* table's OID */
+ bool t_shared; /* is it a shared catalog? */
+} TwoPhasePgStatRecord;
+
+/*
+ * Info about current "snapshot" of stats file
+ */
static MemoryContext pgStatLocalContext = NULL;
static HTAB *pgStatDBHash = NULL;
static PgBackendStatus *localBackendStatusTable = NULL;
static int localNumBackends = 0;
+/*
+ * Cluster wide statistics, kept in the stats collector.
+ * Contains statistics that are not collected per database
+ * or per table.
+ */
+static PgStat_GlobalStats globalStats;
+
static volatile bool need_exit = false;
static volatile bool need_statwrite = false;
static HTAB *pgstat_read_statsfile(Oid onlydb);
static void backend_read_statsfile(void);
static void pgstat_read_current_status(void);
+
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
static HTAB *pgstat_collect_oids(Oid catalogid);
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+
static void pgstat_setup_memcxt(void);
static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
+static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
/* ------------------------------------------------------------
return 0;
}
+void allow_immediate_pgstat_restart(void)
+{
+ last_pgstat_start_time = 0;
+}
/* ------------------------------------------------------------
* Public functions used by backends follow
/* ----------
* pgstat_report_tabstat() -
*
- * Called from tcop/postgres.c to send the so far collected
- * per table access statistics to the collector.
+ * Called from tcop/postgres.c to send the so far collected per-table
+ * access statistics to the collector. Note that this is called only
+ * when not within a transaction, so it is fair to use transaction stop
+ * time as an approximation of current time.
* ----------
*/
void
-pgstat_report_tabstat(void)
+pgstat_report_tabstat(bool force)
{
+ /* we assume this inits to all zeroes: */
+ static const PgStat_TableCounts all_zeroes;
+ static TimestampTz last_report = 0;
+
+ TimestampTz now;
+ PgStat_MsgTabstat regular_msg;
+ PgStat_MsgTabstat shared_msg;
+ TabStatusArray *tsa;
int i;
- if (pgStatSock < 0 ||
- (!pgstat_collect_tuplelevel &&
- !pgstat_collect_blocklevel))
- {
- /* Not reporting stats, so just flush whatever we have */
- RegularTabStat.tsa_used = 0;
- SharedTabStat.tsa_used = 0;
+ /* Don't expend a clock check if nothing to do */
+ if (pgStatTabList == NULL ||
+ pgStatTabList->tsa_used == 0)
return;
- }
/*
- * For each message buffer used during the last query set the header
- * fields and send it out.
+ * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
+ * msec since we last sent one, or the caller wants to force stats out.
+ */
+ now = GetCurrentTransactionStopTimestamp();
+ if (!force &&
+ !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
+ return;
+ last_report = now;
+
+ /*
+ * Scan through the TabStatusArray struct(s) to find tables that actually
+ * have counts, and build messages to send. We have to separate shared
+ * relations from regular ones because the databaseid field in the
+ * message header has to depend on that.
*/
- for (i = 0; i < RegularTabStat.tsa_used; i++)
+ regular_msg.m_databaseid = MyDatabaseId;
+ shared_msg.m_databaseid = InvalidOid;
+ regular_msg.m_nentries = 0;
+ shared_msg.m_nentries = 0;
+
+ for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
{
- PgStat_MsgTabstat *tsmsg = RegularTabStat.tsa_messages[i];
- int n;
- int len;
+ for (i = 0; i < tsa->tsa_used; i++)
+ {
+ PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+ PgStat_MsgTabstat *this_msg;
+ PgStat_TableEntry *this_ent;
+
+ /* Shouldn't have any pending transaction-dependent counts */
+ Assert(entry->trans == NULL);
- n = tsmsg->m_nentries;
- len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
- n * sizeof(PgStat_TableEntry);
+ /*
+ * Ignore entries that didn't accumulate any actual counts,
+ * such as indexes that were opened by the planner but not used.
+ */
+ if (memcmp(&entry->t_counts, &all_zeroes,
+ sizeof(PgStat_TableCounts)) == 0)
+ continue;
+ /*
+ * OK, insert data into the appropriate message, and send if full.
+ */
+ this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+ this_ent = &this_msg->m_entry[this_msg->m_nentries];
+ this_ent->t_id = entry->t_id;
+ memcpy(&this_ent->t_counts, &entry->t_counts,
+ sizeof(PgStat_TableCounts));
+ if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+ {
+ pgstat_send_tabstat(this_msg);
+ this_msg->m_nentries = 0;
+ }
+ }
+ /* zero out TableStatus structs after use */
+ MemSet(tsa->tsa_entries, 0,
+ tsa->tsa_used * sizeof(PgStat_TableStatus));
+ tsa->tsa_used = 0;
+ }
+ /*
+ * Send partial messages. If force is true, make sure that any pending
+ * xact commit/abort gets counted, even if no table stats to send.
+ */
+ if (regular_msg.m_nentries > 0 ||
+ (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
+ pgstat_send_tabstat(&regular_msg);
+ if (shared_msg.m_nentries > 0)
+ pgstat_send_tabstat(&shared_msg);
+}
+
+/*
+ * Subroutine for pgstat_report_tabstat: finish and send a tabstat message
+ */
+static void
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
+{
+ int n;
+ int len;
+
+ /* It's unlikely we'd get here with no socket, but maybe not impossible */
+ if (pgStatSock < 0)
+ return;
+
+ /*
+ * Report accumulated xact commit/rollback whenever we send a normal
+ * tabstat message
+ */
+ if (OidIsValid(tsmsg->m_databaseid))
+ {
tsmsg->m_xact_commit = pgStatXactCommit;
tsmsg->m_xact_rollback = pgStatXactRollback;
pgStatXactCommit = 0;
pgStatXactRollback = 0;
-
- pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
- tsmsg->m_databaseid = MyDatabaseId;
- pgstat_send(tsmsg, len);
}
- RegularTabStat.tsa_used = 0;
-
- /* Ditto, for shared relations */
- for (i = 0; i < SharedTabStat.tsa_used; i++)
+ else
{
- PgStat_MsgTabstat *tsmsg = SharedTabStat.tsa_messages[i];
- int n;
- int len;
-
- n = tsmsg->m_nentries;
- len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
- n * sizeof(PgStat_TableEntry);
-
- /* We don't report transaction commit/abort here */
tsmsg->m_xact_commit = 0;
tsmsg->m_xact_rollback = 0;
-
- pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
- tsmsg->m_databaseid = InvalidOid;
- pgstat_send(tsmsg, len);
}
- SharedTabStat.tsa_used = 0;
+
+ n = tsmsg->m_nentries;
+ len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+ n * sizeof(PgStat_TableEntry);
+
+ pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+ pgstat_send(tsmsg, len);
}
pgstat_send(&msg, sizeof(msg));
}
-/*
- * Enlarge a TabStatArray
- */
-static void
-more_tabstat_space(TabStatArray *tsarr)
-{
- PgStat_MsgTabstat *newMessages;
- PgStat_MsgTabstat **msgArray;
- int newAlloc;
- int i;
-
- AssertArg(PointerIsValid(tsarr));
-
- newAlloc = tsarr->tsa_alloc + TABSTAT_QUANTUM;
-
- /* Create (another) quantum of message buffers */
- newMessages = (PgStat_MsgTabstat *)
- MemoryContextAllocZero(TopMemoryContext,
- sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
-
- /* Create or enlarge the pointer array */
- if (tsarr->tsa_messages == NULL)
- msgArray = (PgStat_MsgTabstat **)
- MemoryContextAlloc(TopMemoryContext,
- sizeof(PgStat_MsgTabstat *) * newAlloc);
- else
- msgArray = (PgStat_MsgTabstat **)
- repalloc(tsarr->tsa_messages,
- sizeof(PgStat_MsgTabstat *) * newAlloc);
-
- for (i = 0; i < TABSTAT_QUANTUM; i++)
- msgArray[tsarr->tsa_alloc + i] = newMessages++;
- tsarr->tsa_messages = msgArray;
- tsarr->tsa_alloc = newAlloc;
-
- Assert(tsarr->tsa_used < tsarr->tsa_alloc);
-}
/* ----------
* pgstat_initstats() -
*
- * Called from various places usually dealing with initialization
- * of Relation or Scan structures. The data placed into these
- * structures from here tell where later to count for buffer reads,
- * scans and tuples fetched.
+ * Initialize a relcache entry to count access statistics.
+ * Called whenever a relation is opened.
+ *
+ * We assume that a relcache entry's pgstat_info field is zeroed by
+ * relcache.c when the relcache entry is made; thereafter it is long-lived
+ * data. We can avoid repeated searches of the TabStatus arrays when the
+ * same relation is touched repeatedly within a transaction.
* ----------
*/
void
-pgstat_initstats(PgStat_Info *stats, Relation rel)
+pgstat_initstats(Relation rel)
{
Oid rel_id = rel->rd_id;
- PgStat_TableEntry *useent;
- TabStatArray *tsarr;
- PgStat_MsgTabstat *tsmsg;
- int mb;
- int i;
+ char relkind = rel->rd_rel->relkind;
- /*
- * Initialize data not to count at all.
- */
- stats->tabentry = NULL;
+ /* We only count stats for things that have storage */
+ if (!(relkind == RELKIND_RELATION ||
+ relkind == RELKIND_INDEX ||
+ relkind == RELKIND_TOASTVALUE))
+ {
+ rel->pgstat_info = NULL;
+ return;
+ }
if (pgStatSock < 0 ||
!(pgstat_collect_tuplelevel ||
pgstat_collect_blocklevel))
+ {
+ /* We're not counting at all */
+ rel->pgstat_info = NULL;
return;
+ }
- tsarr = rel->rd_rel->relisshared ? &SharedTabStat : &RegularTabStat;
+ /*
+ * If we already set up this relation in the current transaction,
+ * nothing to do.
+ */
+ if (rel->pgstat_info != NULL &&
+ rel->pgstat_info->t_id == rel_id)
+ return;
+
+ /* Else find or make the PgStat_TableStatus entry, and update link */
+ rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+}
+
+/*
+ * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+ PgStat_TableStatus *entry;
+ TabStatusArray *tsa;
+ TabStatusArray *prev_tsa;
+ int i;
/*
- * Search the already-used message slots for this relation.
+ * Search the already-used tabstat slots for this relation.
*/
- for (mb = 0; mb < tsarr->tsa_used; mb++)
+ prev_tsa = NULL;
+ for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
{
- tsmsg = tsarr->tsa_messages[mb];
-
- for (i = tsmsg->m_nentries; --i >= 0;)
+ for (i = 0; i < tsa->tsa_used; i++)
{
- if (tsmsg->m_entry[i].t_id == rel_id)
- {
- stats->tabentry = (void *) &(tsmsg->m_entry[i]);
- return;
- }
+ entry = &tsa->tsa_entries[i];
+ if (entry->t_id == rel_id)
+ return entry;
}
- if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
- continue;
-
- /*
- * Not found, but found a message buffer with an empty slot instead.
- * Fine, let's use this one.
- */
- i = tsmsg->m_nentries++;
- useent = &tsmsg->m_entry[i];
- MemSet(useent, 0, sizeof(PgStat_TableEntry));
- useent->t_id = rel_id;
- stats->tabentry = (void *) useent;
- return;
+ if (tsa->tsa_used < TABSTAT_QUANTUM)
+ {
+ /*
+ * It must not be present, but we found a free slot instead.
+ * Fine, let's use this one. We assume the entry was already
+ * zeroed, either at creation or after last use.
+ */
+ entry = &tsa->tsa_entries[tsa->tsa_used++];
+ entry->t_id = rel_id;
+ entry->t_shared = isshared;
+ return entry;
+ }
}
/*
- * If we ran out of message buffers, we just allocate more.
+ * We ran out of tabstat slots, so allocate more. Be sure they're zeroed.
+ */
+ tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
+ sizeof(TabStatusArray));
+ if (prev_tsa)
+ prev_tsa->tsa_next = tsa;
+ else
+ pgStatTabList = tsa;
+
+ /*
+ * Use the first entry of the new TabStatusArray.
*/
- if (tsarr->tsa_used >= tsarr->tsa_alloc)
- more_tabstat_space(tsarr);
+ entry = &tsa->tsa_entries[tsa->tsa_used++];
+ entry->t_id = rel_id;
+ entry->t_shared = isshared;
+ return entry;
+}
+
+/*
+ * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
+ */
+static PgStat_SubXactStatus *
+get_tabstat_stack_level(int nest_level)
+{
+ PgStat_SubXactStatus *xact_state;
+
+ xact_state = pgStatXactStack;
+ if (xact_state == NULL || xact_state->nest_level != nest_level)
+ {
+ xact_state = (PgStat_SubXactStatus *)
+ MemoryContextAlloc(TopTransactionContext,
+ sizeof(PgStat_SubXactStatus));
+ xact_state->nest_level = nest_level;
+ xact_state->prev = pgStatXactStack;
+ xact_state->first = NULL;
+ pgStatXactStack = xact_state;
+ }
+ return xact_state;
+}
+
+/*
+ * add_tabstat_xact_level - add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+ PgStat_SubXactStatus *xact_state;
+ PgStat_TableXactStatus *trans;
/*
- * Use the first entry of the next message buffer.
+ * If this is the first rel to be modified at the current nest level,
+ * we first have to push a transaction stack entry.
*/
- mb = tsarr->tsa_used++;
- tsmsg = tsarr->tsa_messages[mb];
- tsmsg->m_nentries = 1;
- useent = &tsmsg->m_entry[0];
- MemSet(useent, 0, sizeof(PgStat_TableEntry));
- useent->t_id = rel_id;
- stats->tabentry = (void *) useent;
+ xact_state = get_tabstat_stack_level(nest_level);
+
+ /* Now make a per-table stack entry */
+ trans = (PgStat_TableXactStatus *)
+ MemoryContextAllocZero(TopTransactionContext,
+ sizeof(PgStat_TableXactStatus));
+ trans->nest_level = nest_level;
+ trans->upper = pgstat_info->trans;
+ trans->parent = pgstat_info;
+ trans->next = xact_state->first;
+ xact_state->first = trans;
+ pgstat_info->trans = trans;
+}
+
+/*
+ * pgstat_count_heap_insert - count a tuple insertion
+ */
+void
+pgstat_count_heap_insert(Relation rel)
+{
+ PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+ if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+ {
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ /* t_tuples_inserted is nontransactional, so just advance it */
+ pgstat_info->t_counts.t_tuples_inserted++;
+
+ /* We have to log the transactional effect at the proper level */
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+
+ pgstat_info->trans->tuples_inserted++;
+ }
+}
+
+/*
+ * pgstat_count_heap_update - count a tuple update
+ */
+void
+pgstat_count_heap_update(Relation rel)
+{
+ PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+ if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+ {
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ /* t_tuples_updated is nontransactional, so just advance it */
+ pgstat_info->t_counts.t_tuples_updated++;
+
+ /* We have to log the transactional effect at the proper level */
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+
+ /* An UPDATE both inserts a new tuple and deletes the old */
+ pgstat_info->trans->tuples_inserted++;
+ pgstat_info->trans->tuples_deleted++;
+ }
+}
+
+/*
+ * pgstat_count_heap_delete - count a tuple deletion
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+ PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+ if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+ {
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ /* t_tuples_deleted is nontransactional, so just advance it */
+ pgstat_info->t_counts.t_tuples_deleted++;
+
+ /* We have to log the transactional effect at the proper level */
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+
+ pgstat_info->trans->tuples_deleted++;
+ }
}
/* ----------
- * pgstat_count_xact_commit() -
+ * AtEOXact_PgStat
*
- * Called from access/transam/xact.c to count transaction commits.
+ * Called from access/transam/xact.c at top-level transaction commit/abort.
* ----------
*/
void
-pgstat_count_xact_commit(void)
+AtEOXact_PgStat(bool isCommit)
{
- if (!pgstat_collect_tuplelevel &&
- !pgstat_collect_blocklevel)
- return;
-
- pgStatXactCommit++;
+ PgStat_SubXactStatus *xact_state;
/*
- * If there was no relation activity yet, just make one existing message
- * buffer used without slots, causing the next report to tell new
- * xact-counters.
+ * Count transaction commit or abort. (We use counters, not just bools,
+ * in case the reporting message isn't sent right away.)
*/
- if (RegularTabStat.tsa_alloc == 0)
- more_tabstat_space(&RegularTabStat);
+ if (isCommit)
+ pgStatXactCommit++;
+ else
+ pgStatXactRollback++;
- if (RegularTabStat.tsa_used == 0)
+ /*
+ * Transfer transactional insert/update counts into the base tabstat
+ * entries. We don't bother to free any of the transactional state,
+ * since it's all in TopTransactionContext and will go away anyway.
+ */
+ xact_state = pgStatXactStack;
+ if (xact_state != NULL)
{
- RegularTabStat.tsa_used++;
- RegularTabStat.tsa_messages[0]->m_nentries = 0;
+ PgStat_TableXactStatus *trans;
+
+ Assert(xact_state->nest_level == 1);
+ Assert(xact_state->prev == NULL);
+ for (trans = xact_state->first; trans != NULL; trans = trans->next)
+ {
+ PgStat_TableStatus *tabstat;
+
+ Assert(trans->nest_level == 1);
+ Assert(trans->upper == NULL);
+ tabstat = trans->parent;
+ Assert(tabstat->trans == trans);
+ if (isCommit)
+ {
+ tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted;
+ tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
+ }
+ else
+ {
+ /* inserted tuples are dead, deleted tuples are unaffected */
+ tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+ }
+ tabstat->trans = NULL;
+ }
}
-}
+ pgStatXactStack = NULL;
+ /* Make sure any stats snapshot is thrown away */
+ pgstat_clear_snapshot();
+}
/* ----------
- * pgstat_count_xact_rollback() -
+ * AtEOSubXact_PgStat
*
- * Called from access/transam/xact.c to count transaction rollbacks.
+ * Called from access/transam/xact.c at subtransaction commit/abort.
* ----------
*/
void
-pgstat_count_xact_rollback(void)
+AtEOSubXact_PgStat(bool isCommit, int nestDepth)
{
- if (!pgstat_collect_tuplelevel &&
- !pgstat_collect_blocklevel)
- return;
-
- pgStatXactRollback++;
+ PgStat_SubXactStatus *xact_state;
/*
- * If there was no relation activity yet, just make one existing message
- * buffer used without slots, causing the next report to tell new
- * xact-counters.
+ * Transfer transactional insert/update counts into the next higher
+ * subtransaction state.
*/
- if (RegularTabStat.tsa_alloc == 0)
- more_tabstat_space(&RegularTabStat);
+ xact_state = pgStatXactStack;
+ if (xact_state != NULL &&
+ xact_state->nest_level >= nestDepth)
+ {
+ PgStat_TableXactStatus *trans;
+ PgStat_TableXactStatus *next_trans;
+
+ /* delink xact_state from stack immediately to simplify reuse case */
+ pgStatXactStack = xact_state->prev;
+
+ for (trans = xact_state->first; trans != NULL; trans = next_trans)
+ {
+ PgStat_TableStatus *tabstat;
- if (RegularTabStat.tsa_used == 0)
+ next_trans = trans->next;
+ Assert(trans->nest_level == nestDepth);
+ tabstat = trans->parent;
+ Assert(tabstat->trans == trans);
+ if (isCommit)
+ {
+ if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+ {
+ trans->upper->tuples_inserted += trans->tuples_inserted;
+ trans->upper->tuples_deleted += trans->tuples_deleted;
+ tabstat->trans = trans->upper;
+ pfree(trans);
+ }
+ else
+ {
+ /*
+ * When there isn't an immediate parent state, we can
+ * just reuse the record instead of going through a
+ * palloc/pfree pushup (this works since it's all in
+ * TopTransactionContext anyway). We have to re-link
+ * it into the parent level, though, and that might mean
+ * pushing a new entry into the pgStatXactStack.
+ */
+ PgStat_SubXactStatus *upper_xact_state;
+
+ upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
+ trans->next = upper_xact_state->first;
+ upper_xact_state->first = trans;
+ trans->nest_level = nestDepth - 1;
+ }
+ }
+ else
+ {
+ /*
+ * On abort, inserted tuples are dead (and can be bounced out
+ * to the top-level tabstat), deleted tuples are unaffected
+ */
+ tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
+ tabstat->trans = trans->upper;
+ pfree(trans);
+ }
+ }
+ pfree(xact_state);
+ }
+}
+
+
+/*
+ * AtPrepare_PgStat
+ * Save the transactional stats state at 2PC transaction prepare.
+ *
+ * In this phase we just generate 2PC records for all the pending
+ * transaction-dependent stats work.
+ */
+void
+AtPrepare_PgStat(void)
+{
+ PgStat_SubXactStatus *xact_state;
+
+ xact_state = pgStatXactStack;
+ if (xact_state != NULL)
{
- RegularTabStat.tsa_used++;
- RegularTabStat.tsa_messages[0]->m_nentries = 0;
+ PgStat_TableXactStatus *trans;
+
+ Assert(xact_state->nest_level == 1);
+ Assert(xact_state->prev == NULL);
+ for (trans = xact_state->first; trans != NULL; trans = trans->next)
+ {
+ PgStat_TableStatus *tabstat;
+ TwoPhasePgStatRecord record;
+
+ Assert(trans->nest_level == 1);
+ Assert(trans->upper == NULL);
+ tabstat = trans->parent;
+ Assert(tabstat->trans == trans);
+
+ record.tuples_inserted = trans->tuples_inserted;
+ record.tuples_deleted = trans->tuples_deleted;
+ record.t_id = tabstat->t_id;
+ record.t_shared = tabstat->t_shared;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+ &record, sizeof(TwoPhasePgStatRecord));
+ }
}
}
+/*
+ * PostPrepare_PgStat
+ * Clean up after successful PREPARE.
+ *
+ * All we need do here is unlink the transaction stats state from the
+ * nontransactional state. The nontransactional action counts will be
+ * reported to the stats collector immediately, while the effects on live
+ * and dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat(void)
+{
+ PgStat_SubXactStatus *xact_state;
+
+ /*
+ * We don't bother to free any of the transactional state,
+ * since it's all in TopTransactionContext and will go away anyway.
+ */
+ xact_state = pgStatXactStack;
+ if (xact_state != NULL)
+ {
+ PgStat_TableXactStatus *trans;
+
+ for (trans = xact_state->first; trans != NULL; trans = trans->next)
+ {
+ PgStat_TableStatus *tabstat;
+
+ tabstat = trans->parent;
+ tabstat->trans = NULL;
+ }
+ }
+ pgStatXactStack = NULL;
+
+ /* Make sure any stats snapshot is thrown away */
+ pgstat_clear_snapshot();
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+ PgStat_TableStatus *pgstat_info;
+
+ /* Find or create a tabstat entry for the rel */
+ pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+ pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted;
+ pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+ PgStat_TableStatus *pgstat_info;
+
+ /* Find or create a tabstat entry for the rel */
+ pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
+
+ /* inserted tuples are dead, deleted tuples are no-ops */
+ pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
+}
+
/* ----------
* pgstat_fetch_stat_dbentry() -
return localNumBackends;
}
+/*
+ * ---------
+ * pgstat_fetch_global() -
+ *
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the global statistics struct.
+ * ---------
+ */
+PgStat_GlobalStats *
+pgstat_fetch_global(void)
+{
+ backend_read_statsfile();
+
+ return &globalStats;
+}
+
/* ------------------------------------------------------------
* Functions for management of the shared-memory PgBackendStatus array
{
volatile PgBackendStatus *beentry = MyBEEntry;
- pgstat_report_tabstat();
+ pgstat_report_tabstat(true);
/*
* Clear my status entry, following the protocol of bumping st_changecount
#endif
}
+/* ----------
+ * pgstat_send_bgwriter() -
+ *
+ * Send bgwriter statistics to the collector
+ * ----------
+ */
+void
+pgstat_send_bgwriter(void)
+{
+ /* We assume this initializes to zeroes */
+ static const PgStat_MsgBgWriter all_zeroes;
+
+ /*
+ * This function can be called even if nothing at all has happened.
+ * In this case, avoid sending a completely empty message to
+ * the stats collector.
+ */
+ if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
+ return;
+
+ /*
+ * Prepare and send the message
+ */
+ pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
+ pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
+
+ /*
+ * Clear out the statistics buffer, so it can be re-used.
+ */
+ MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
+}
+
/* ----------
* PgstatCollectorMain() -
pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
break;
+ case PGSTAT_MTYPE_BGWRITER:
+ pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
+ break;
+
default:
break;
}
format_id = PGSTAT_FILE_FORMAT_ID;
fwrite(&format_id, sizeof(format_id), 1, fpout);
+ /*
+ * Write global stats struct
+ */
+ fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+
/*
* Walk through the database table.
*/
dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ /*
+ * Clear out global statistics so they start from zero in case we can't
+ * load an existing statsfile.
+ */
+ memset(&globalStats, 0, sizeof(globalStats));
+
/*
* Try to open the status file. If it doesn't exist, the backends simply
* return zero for anything and the collector simply starts from scratch
goto done;
}
+ /*
+ * Read global stats struct
+ */
+ if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+ {
+ ereport(pgStatRunningInCollector ? LOG : WARNING,
+ (errmsg("corrupted pgstat.stat file")));
+ goto done;
+ }
+
/*
* We found an existing collector stats file. Read it and put all the
* hashtable entries into place.
void
pgstat_clear_snapshot(void)
{
- /* In an autovacuum worker process we keep the stats forever */
- if (IsAutoVacuumWorkerProcess())
- return;
-
/* Release memory, if any was allocated */
if (pgStatLocalContext)
MemoryContextDelete(pgStatLocalContext);
* If it's a new table entry, initialize counters to the values we
* just got.
*/
- tabentry->numscans = tabmsg[i].t_numscans;
- tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
- tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
- tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
- tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
- tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
-
- tabentry->n_live_tuples = tabmsg[i].t_tuples_inserted;
- tabentry->n_dead_tuples = tabmsg[i].t_tuples_updated +
- tabmsg[i].t_tuples_deleted;
+ tabentry->numscans = tabmsg[i].t_counts.t_numscans;
+ tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
+ tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
+ tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
+ tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+ tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
+ tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
+ tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
+ tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
+
tabentry->last_anl_tuples = 0;
tabentry->vacuum_timestamp = 0;
tabentry->autovac_vacuum_timestamp = 0;
tabentry->analyze_timestamp = 0;
tabentry->autovac_analyze_timestamp = 0;
-
- tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
- tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
}
else
{
/*
* Otherwise add the values to the existing entry.
*/
- tabentry->numscans += tabmsg[i].t_numscans;
- tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
- tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
- tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
- tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
- tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
-
- tabentry->n_live_tuples += tabmsg[i].t_tuples_inserted -
- tabmsg[i].t_tuples_deleted;
- tabentry->n_dead_tuples += tabmsg[i].t_tuples_updated +
- tabmsg[i].t_tuples_deleted;
-
- tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
- tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
+ tabentry->numscans += tabmsg[i].t_counts.t_numscans;
+ tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+ tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+ tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+ tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+ tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+ tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
+ tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
+ tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+ tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
}
/*
- * Add table stats to the database entry.
- */
- dbentry->n_tuples_returned += tabmsg[i].t_tuples_returned;
- dbentry->n_tuples_fetched += tabmsg[i].t_tuples_fetched;
- dbentry->n_tuples_inserted += tabmsg[i].t_tuples_inserted;
- dbentry->n_tuples_updated += tabmsg[i].t_tuples_updated;
- dbentry->n_tuples_deleted += tabmsg[i].t_tuples_deleted;
-
- /*
- * And add the block IO to the database entry.
+ * Add per-table stats to the per-database entry, too.
*/
- dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
- dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
+ dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
+ dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
+ dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
+ dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
+ dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+ dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
+ dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
}
}
tabentry->n_dead_tuples = msg->m_dead_tuples;
tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
}
+
+
+/* ----------
+ * pgstat_recv_bgwriter() -
+ *
+ * Process a BGWRITER message.
+ * ----------
+ */
+static void
+pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
+{
+ globalStats.timed_checkpoints += msg->m_timed_checkpoints;
+ globalStats.requested_checkpoints += msg->m_requested_checkpoints;
+ globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
+ globalStats.buf_written_lru += msg->m_buf_written_lru;
+ globalStats.buf_written_all += msg->m_buf_written_all;
+ globalStats.maxwritten_lru += msg->m_maxwritten_lru;
+ globalStats.maxwritten_all += msg->m_maxwritten_all;
+}