X-Git-Url: https://granicus.if.org/sourcecode?a=blobdiff_plain;f=src%2Fbackend%2Fpostmaster%2Fpgstat.c;h=1d80c311d879d9cf9009621860cda3ab19c6dea9;hb=b4b6923e03f4d29636a94f6f4cc2f5cf6298b8c8;hp=b41a16de44ce86435068597a40e0fa3537ccd08b;hpb=77947c51c08179b8bc12347a7fbcb2c8d7908302;p=postgresql diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index b41a16de44..1d80c311d8 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -11,9 +11,9 @@ * - Add a pgstat config column to pg_database, so this * entire thing can be enabled/disabled on a per db basis. * - * Copyright (c) 2001-2007, PostgreSQL Global Development Group + * Copyright (c) 2001-2011, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.156 2007/05/27 03:50:39 tgl Exp $ + * src/backend/postmaster/pgstat.c * ---------- */ #include "postgres.h" @@ -42,11 +42,13 @@ #include "access/twophase_rmgr.h" #include "access/xact.h" #include "catalog/pg_database.h" +#include "catalog/pg_proc.h" #include "libpq/ip.h" #include "libpq/libpq.h" #include "libpq/pqsignal.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "pg_trace.h" #include "postmaster/autovacuum.h" #include "postmaster/fork_process.h" #include "postmaster/postmaster.h" @@ -55,23 +57,33 @@ #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/tqual.h" /* ---------- * Paths for the statistics files (relative to installation's $PGDATA). * ---------- */ -#define PGSTAT_STAT_FILENAME "global/pgstat.stat" -#define PGSTAT_STAT_TMPFILE "global/pgstat.tmp" +#define PGSTAT_STAT_PERMANENT_FILENAME "global/pgstat.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "global/pgstat.tmp" /* ---------- * Timer definitions. * ---------- */ -#define PGSTAT_STAT_INTERVAL 500 /* How often to write the status file; - * in milliseconds. */ +#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file + * updates; in milliseconds. */ + +#define PGSTAT_RETRY_DELAY 10 /* How long to wait between statistics + * update requests; in milliseconds. */ + +#define PGSTAT_MAX_WAIT_TIME 5000 /* Maximum time to wait for a stats + * file update; in milliseconds. */ #define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a * failed statistics collector; in @@ -80,6 +92,8 @@ #define PGSTAT_SELECT_TIMEOUT 2 /* How often to check for postmaster * death; in seconds. */ +#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY) + /* ---------- * The initial size hints for the hash tables used in the collector. @@ -87,17 +101,24 @@ */ #define PGSTAT_DB_HASH_SIZE 16 #define PGSTAT_TAB_HASH_SIZE 512 +#define PGSTAT_FUNCTION_HASH_SIZE 512 /* ---------- * GUC parameters * ---------- */ -bool pgstat_collect_startcollector = true; -bool pgstat_collect_resetonpmstart = false; -bool pgstat_collect_tuplelevel = false; -bool pgstat_collect_blocklevel = false; -bool pgstat_collect_querystring = false; +bool pgstat_track_activities = false; +bool pgstat_track_counts = false; +int pgstat_track_functions = TRACK_FUNC_OFF; +int pgstat_track_activity_query_size = 1024; + +/* ---------- + * Built from GUC parameter + * ---------- + */ +char *pgstat_stat_filename = NULL; +char *pgstat_stat_tmpname = NULL; /* * BgWriter global statistics counters (unused in other processes). @@ -110,7 +131,7 @@ PgStat_MsgBgWriter BgWriterStats; * Local data * ---------- */ -NON_EXEC_STATIC int pgStatSock = -1; +NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET; static struct sockaddr_storage pgStatAddr; @@ -129,17 +150,29 @@ static bool pgStatRunningInCollector = false; * avoiding repeated searches in pgstat_initstats() when a relation is * repeatedly opened during a transaction. */ -#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ +#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ typedef struct TabStatusArray { struct TabStatusArray *tsa_next; /* link to next array, if any */ - int tsa_used; /* # entries currently used */ + int tsa_used; /* # entries currently used */ PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */ } TabStatusArray; static TabStatusArray *pgStatTabList = NULL; +/* + * Backends store per-function info that's waiting to be sent to the collector + * in this hash table (indexed by function OID). + */ +static HTAB *pgStatFunctions = NULL; + +/* + * Indicates if backend has some function stats that it hasn't yet + * sent to the collector. + */ +static bool have_function_stats = false; + /* * Tuple insertion/deletion counts for an open transaction can't be propagated * into PgStat_TableStatus counters until we know if it is going to commit @@ -149,7 +182,7 @@ static TabStatusArray *pgStatTabList = NULL; */ typedef struct PgStat_SubXactStatus { - int nest_level; /* subtransaction nest level */ + int nest_level; /* subtransaction nest level */ struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */ PgStat_TableXactStatus *first; /* head of list for this subxact */ } PgStat_SubXactStatus; @@ -162,10 +195,11 @@ static int pgStatXactRollback = 0; /* Record that's written to 2PC state file when pgstat state is persisted */ typedef struct TwoPhasePgStatRecord { - PgStat_Counter tuples_inserted; /* tuples inserted in xact */ - PgStat_Counter tuples_deleted; /* tuples deleted in xact */ - Oid t_id; /* table's OID */ - bool t_shared; /* is it a shared catalog? */ + PgStat_Counter tuples_inserted; /* tuples inserted in xact */ + PgStat_Counter tuples_updated; /* tuples updated in xact */ + PgStat_Counter tuples_deleted; /* tuples deleted in xact */ + Oid t_id; /* table's OID */ + bool t_shared; /* is it a shared catalog? */ } TwoPhasePgStatRecord; /* @@ -183,8 +217,21 @@ static int localNumBackends = 0; */ static PgStat_GlobalStats globalStats; +/* Last time the collector successfully wrote the stats file */ +static TimestampTz last_statwrite; + +/* Latest statistics request time from backends */ +static TimestampTz last_statrequest; + static volatile bool need_exit = false; -static volatile bool need_statwrite = false; +static volatile bool got_SIGHUP = false; + +/* + * Total time charged to functions so far in the current backend. + * We use this to help separate "self" and "other" time charges. + * (We assume this initializes to zero.) + */ +static instr_time total_func_time; /* ---------- @@ -197,16 +244,19 @@ static pid_t pgstat_forkexec(void); NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]); static void pgstat_exit(SIGNAL_ARGS); -static void force_statwrite(SIGNAL_ARGS); static void pgstat_beshutdown_hook(int code, Datum arg); +static void pgstat_sighup_handler(SIGNAL_ARGS); static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); -static void pgstat_write_statsfile(void); -static HTAB *pgstat_read_statsfile(Oid onlydb); +static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, + Oid tableoid, bool create); +static void pgstat_write_statsfile(bool permanent); +static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent); static void backend_read_statsfile(void); static void pgstat_read_current_status(void); static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg); +static void pgstat_send_funcstats(void); static HTAB *pgstat_collect_oids(Oid catalogid); static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); @@ -216,14 +266,20 @@ static void pgstat_setup_memcxt(void); static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); static void pgstat_send(void *msg, int len); +static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len); static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len); static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len); static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len); static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len); +static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len); +static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len); static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len); static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len); static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len); static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); +static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); +static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); +static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); /* ------------------------------------------------------------ @@ -256,28 +312,6 @@ pgstat_init(void) #define TESTBYTEVAL ((char) 199) - /* - * Force start of collector daemon if something to collect. Note that - * pgstat_collect_querystring is now an independent facility that does not - * require the collector daemon. - */ - if (pgstat_collect_tuplelevel || - pgstat_collect_blocklevel) - pgstat_collect_startcollector = true; - - /* - * If we don't have to start a collector or should reset the collected - * statistics on postmaster start, simply remove the stats file. - */ - if (!pgstat_collect_startcollector || pgstat_collect_resetonpmstart) - pgstat_reset_all(); - - /* - * Nothing else required if collector will not get started - */ - if (!pgstat_collect_startcollector) - return; - /* * Create the UDP socket for sending and receiving statistic messages */ @@ -321,7 +355,7 @@ pgstat_init(void) /* * Create the socket. */ - if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) < 0) + if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET) { ereport(LOG, (errcode_for_socket_access(), @@ -339,7 +373,7 @@ pgstat_init(void) (errcode_for_socket_access(), errmsg("could not bind socket for statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -350,7 +384,7 @@ pgstat_init(void) (errcode_for_socket_access(), errmsg("could not get address of socket for statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -366,7 +400,7 @@ pgstat_init(void) (errcode_for_socket_access(), errmsg("could not connect socket for statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -387,7 +421,7 @@ retry1: (errcode_for_socket_access(), errmsg("could not send test message on socket for statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -400,6 +434,7 @@ retry1: { FD_ZERO(&rset); FD_SET(pgStatSock, &rset); + tv.tv_sec = 0; tv.tv_usec = 500000; sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv); @@ -412,7 +447,7 @@ retry1: (errcode_for_socket_access(), errmsg("select() failed in statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset)) @@ -427,7 +462,7 @@ retry1: (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("test message did not get through on socket for statistics collector"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -442,7 +477,7 @@ retry2: (errcode_for_socket_access(), errmsg("could not receive test message on socket for statistics collector: %m"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -452,7 +487,7 @@ retry2: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("incorrect test message transmission on socket for statistics collector"))); closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; continue; } @@ -461,7 +496,7 @@ retry2: } /* Did we find a working address? */ - if (!addr || pgStatSock < 0) + if (!addr || pgStatSock == PGINVALID_SOCKET) goto startup_failed; /* @@ -488,27 +523,30 @@ startup_failed: if (addrs) pg_freeaddrinfo_all(hints.ai_family, addrs); - if (pgStatSock >= 0) + if (pgStatSock != PGINVALID_SOCKET) closesocket(pgStatSock); - pgStatSock = -1; + pgStatSock = PGINVALID_SOCKET; - /* Adjust GUC variables to suppress useless activity */ - pgstat_collect_startcollector = false; - pgstat_collect_tuplelevel = false; - pgstat_collect_blocklevel = false; + /* + * Adjust GUC variables to suppress useless activity, and for debugging + * purposes (seeing track_counts off is a clue that we failed here). We + * use PGC_S_OVERRIDE because there is no point in trying to turn it back + * on from postgresql.conf without a restart. + */ + SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE); } /* * pgstat_reset_all() - * - * Remove the stats file. This is used on server start if the - * stats_reset_on_server_start feature is enabled, or if WAL + * Remove the stats file. This is currently used only if WAL * recovery is needed after a crash. */ void pgstat_reset_all(void) { - unlink(PGSTAT_STAT_FILENAME); + unlink(pgstat_stat_filename); + unlink(PGSTAT_STAT_PERMANENT_FILENAME); } #ifdef EXEC_BACKEND @@ -536,7 +574,7 @@ pgstat_forkexec(void) #endif /* EXEC_BACKEND */ -/* ---------- +/* * pgstat_start() - * * Called from postmaster at startup or after an existing collector @@ -545,7 +583,6 @@ pgstat_forkexec(void) * Returns PID of child process, or 0 if fail. * * Note: if fail, we will be called again from the postmaster main loop. - * ---------- */ int pgstat_start(void) @@ -554,9 +591,10 @@ pgstat_start(void) pid_t pgStatPid; /* - * Do nothing if no collector needed + * Check that the socket is there, else pgstat_init failed and we can do + * nothing useful. */ - if (!pgstat_collect_startcollector) + if (pgStatSock == PGINVALID_SOCKET) return 0; /* @@ -571,22 +609,6 @@ pgstat_start(void) return 0; last_pgstat_start_time = curtime; - /* - * Check that the socket is there, else pgstat_init failed. - */ - if (pgStatSock < 0) - { - ereport(LOG, - (errmsg("statistics collector startup skipped"))); - - /* - * We can only get here if someone tries to manually turn - * pgstat_collect_startcollector on after it had been off. - */ - pgstat_collect_startcollector = false; - return 0; - } - /* * Okay, fork off the collector. */ @@ -625,9 +647,10 @@ pgstat_start(void) return 0; } -void allow_immediate_pgstat_restart(void) +void +allow_immediate_pgstat_restart(void) { - last_pgstat_start_time = 0; + last_pgstat_start_time = 0; } /* ------------------------------------------------------------ @@ -637,20 +660,20 @@ void allow_immediate_pgstat_restart(void) /* ---------- - * pgstat_report_tabstat() - + * pgstat_report_stat() - * * Called from tcop/postgres.c to send the so far collected per-table - * access statistics to the collector. Note that this is called only - * when not within a transaction, so it is fair to use transaction stop - * time as an approximation of current time. + * and function usage statistics to the collector. Note that this is + * called only when not within a transaction, so it is fair to use + * transaction stop time as an approximation of current time. * ---------- */ void -pgstat_report_tabstat(bool force) +pgstat_report_stat(bool force) { /* we assume this inits to all zeroes: */ static const PgStat_TableCounts all_zeroes; - static TimestampTz last_report = 0; + static TimestampTz last_report = 0; TimestampTz now; PgStat_MsgTabstat regular_msg; @@ -659,8 +682,8 @@ pgstat_report_tabstat(bool force) int i; /* Don't expend a clock check if nothing to do */ - if (pgStatTabList == NULL || - pgStatTabList->tsa_used == 0) + if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) + && !have_function_stats) return; /* @@ -676,8 +699,8 @@ pgstat_report_tabstat(bool force) /* * Scan through the TabStatusArray struct(s) to find tables that actually * have counts, and build messages to send. We have to separate shared - * relations from regular ones because the databaseid field in the - * message header has to depend on that. + * relations from regular ones because the databaseid field in the message + * header has to depend on that. */ regular_msg.m_databaseid = MyDatabaseId; shared_msg.m_databaseid = InvalidOid; @@ -696,12 +719,13 @@ pgstat_report_tabstat(bool force) Assert(entry->trans == NULL); /* - * Ignore entries that didn't accumulate any actual counts, - * such as indexes that were opened by the planner but not used. + * Ignore entries that didn't accumulate any actual counts, such + * as indexes that were opened by the planner but not used. */ if (memcmp(&entry->t_counts, &all_zeroes, sizeof(PgStat_TableCounts)) == 0) continue; + /* * OK, insert data into the appropriate message, and send if full. */ @@ -731,10 +755,13 @@ pgstat_report_tabstat(bool force) pgstat_send_tabstat(®ular_msg); if (shared_msg.m_nentries > 0) pgstat_send_tabstat(&shared_msg); + + /* Now, send function statistics */ + pgstat_send_funcstats(); } /* - * Subroutine for pgstat_report_tabstat: finish and send a tabstat message + * Subroutine for pgstat_report_stat: finish and send a tabstat message */ static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg) @@ -743,7 +770,7 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg) int len; /* It's unlikely we'd get here with no socket, but maybe not impossible */ - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; /* @@ -771,24 +798,81 @@ pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg) pgstat_send(tsmsg, len); } +/* + * Subroutine for pgstat_report_stat: populate and send a function stat message + */ +static void +pgstat_send_funcstats(void) +{ + /* we assume this inits to all zeroes: */ + static const PgStat_FunctionCounts all_zeroes; + + PgStat_MsgFuncstat msg; + PgStat_BackendFunctionEntry *entry; + HASH_SEQ_STATUS fstat; + + if (pgStatFunctions == NULL) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT); + msg.m_databaseid = MyDatabaseId; + msg.m_nentries = 0; + + hash_seq_init(&fstat, pgStatFunctions); + while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL) + { + PgStat_FunctionEntry *m_ent; + + /* Skip it if no counts accumulated since last time */ + if (memcmp(&entry->f_counts, &all_zeroes, + sizeof(PgStat_FunctionCounts)) == 0) + continue; + + /* need to convert format of time accumulators */ + m_ent = &msg.m_entry[msg.m_nentries]; + m_ent->f_id = entry->f_id; + m_ent->f_numcalls = entry->f_counts.f_numcalls; + m_ent->f_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time); + m_ent->f_time_self = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time_self); + + if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES) + { + pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + + msg.m_nentries * sizeof(PgStat_FunctionEntry)); + msg.m_nentries = 0; + } + + /* reset the entry's counts */ + MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts)); + } + + if (msg.m_nentries > 0) + pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + + msg.m_nentries * sizeof(PgStat_FunctionEntry)); + + have_function_stats = false; +} + /* ---------- - * pgstat_vacuum_tabstat() - + * pgstat_vacuum_stat() - * * Will tell the collector about objects he can get rid of. * ---------- */ void -pgstat_vacuum_tabstat(void) +pgstat_vacuum_stat(void) { HTAB *htab; PgStat_MsgTabpurge msg; + PgStat_MsgFuncpurge f_msg; HASH_SEQ_STATUS hstat; PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; + PgStat_StatFuncEntry *funcentry; int len; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; /* @@ -813,7 +897,9 @@ pgstat_vacuum_tabstat(void) CHECK_FOR_INTERRUPTS(); - if (hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL) + /* the DB entry for shared tables (with InvalidOid) is never dropped */ + if (OidIsValid(dbid) && + hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL) pgstat_drop_database(dbid); } @@ -888,15 +974,72 @@ pgstat_vacuum_tabstat(void) /* Clean up */ hash_destroy(htab); + + /* + * Now repeat the above steps for functions. However, we needn't bother + * in the common case where no function stats are being collected. + */ + if (dbentry->functions != NULL && + hash_get_num_entries(dbentry->functions) > 0) + { + htab = pgstat_collect_oids(ProcedureRelationId); + + pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE); + f_msg.m_databaseid = MyDatabaseId; + f_msg.m_nentries = 0; + + hash_seq_init(&hstat, dbentry->functions); + while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL) + { + Oid funcid = funcentry->functionid; + + CHECK_FOR_INTERRUPTS(); + + if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL) + continue; + + /* + * Not there, so add this function's Oid to the message + */ + f_msg.m_functionid[f_msg.m_nentries++] = funcid; + + /* + * If the message is full, send it out and reinitialize to empty + */ + if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE) + { + len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) + +f_msg.m_nentries * sizeof(Oid); + + pgstat_send(&f_msg, len); + + f_msg.m_nentries = 0; + } + } + + /* + * Send the rest + */ + if (f_msg.m_nentries > 0) + { + len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) + +f_msg.m_nentries * sizeof(Oid); + + pgstat_send(&f_msg, len); + } + + hash_destroy(htab); + } } /* ---------- * pgstat_collect_oids() - * - * Collect the OIDs of either all databases or all tables, according to - * the parameter, into a temporary hash table. Caller should hash_destroy - * the result when done with it. + * Collect the OIDs of all objects listed in the specified system catalog + * into a temporary hash table. Caller should hash_destroy the result + * when done with it. (However, we make the table in CurrentMemoryContext + * so that it will be freed properly in event of an error.) * ---------- */ static HTAB * @@ -912,16 +1055,17 @@ pgstat_collect_oids(Oid catalogid) hash_ctl.keysize = sizeof(Oid); hash_ctl.entrysize = sizeof(Oid); hash_ctl.hash = oid_hash; + hash_ctl.hcxt = CurrentMemoryContext; htab = hash_create("Temporary table of OIDs", PGSTAT_TAB_HASH_SIZE, &hash_ctl, - HASH_ELEM | HASH_FUNCTION); + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); rel = heap_open(catalogid, AccessShareLock); scan = heap_beginscan(rel, SnapshotNow, 0, NULL); while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL) { - Oid thisoid = HeapTupleGetOid(tup); + Oid thisoid = HeapTupleGetOid(tup); CHECK_FOR_INTERRUPTS(); @@ -939,7 +1083,7 @@ pgstat_collect_oids(Oid catalogid) * * Tell the collector that we just dropped a database. * (If the message gets lost, we will still clean the dead DB eventually - * via future invocations of pgstat_vacuum_tabstat().) + * via future invocations of pgstat_vacuum_stat().) * ---------- */ void @@ -947,7 +1091,7 @@ pgstat_drop_database(Oid databaseid) { PgStat_MsgDropdb msg; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB); @@ -961,16 +1105,20 @@ pgstat_drop_database(Oid databaseid) * * Tell the collector that we just dropped a relation. * (If the message gets lost, we will still clean the dead entry eventually - * via future invocations of pgstat_vacuum_tabstat().) + * via future invocations of pgstat_vacuum_stat().) + * + * Currently not used for lack of any good place to call it; we rely + * entirely on pgstat_vacuum_stat() to clean out stats for dead rels. * ---------- */ +#ifdef NOT_USED void pgstat_drop_relation(Oid relid) { PgStat_MsgTabpurge msg; int len; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; msg.m_tableid[0] = relid; @@ -982,6 +1130,7 @@ pgstat_drop_relation(Oid relid) msg.m_databaseid = MyDatabaseId; pgstat_send(&msg, len); } +#endif /* NOT_USED */ /* ---------- @@ -995,7 +1144,7 @@ pgstat_reset_counters(void) { PgStat_MsgResetcounter msg; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; if (!superuser()) @@ -1008,6 +1157,63 @@ pgstat_reset_counters(void) pgstat_send(&msg, sizeof(msg)); } +/* ---------- + * pgstat_reset_shared_counters() - + * + * Tell the statistics collector to reset cluster-wide shared counters. + * ---------- + */ +void +pgstat_reset_shared_counters(const char *target) +{ + PgStat_MsgResetsharedcounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to reset statistics counters"))); + + if (strcmp(target, "bgwriter") == 0) + msg.m_resettarget = RESET_BGWRITER; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized reset target: \"%s\"", target), + errhint("Target must be \"bgwriter\"."))); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER); + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_reset_single_counter() - + * + * Tell the statistics collector to reset a single counter. + * ---------- + */ +void +pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type) +{ + PgStat_MsgResetsinglecounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to reset statistics counters"))); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER); + msg.m_databaseid = MyDatabaseId; + msg.m_resettype = type; + msg.m_objectid = objoid; + + pgstat_send(&msg, sizeof(msg)); +} /* ---------- * pgstat_report_autovac() - @@ -1022,7 +1228,7 @@ pgstat_report_autovac(Oid dboid) { PgStat_MsgAutovacStart msg; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START); @@ -1040,20 +1246,17 @@ pgstat_report_autovac(Oid dboid) * --------- */ void -pgstat_report_vacuum(Oid tableoid, bool shared, - bool analyze, PgStat_Counter tuples) +pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter tuples) { PgStat_MsgVacuum msg; - if (pgStatSock < 0 || - !pgstat_collect_tuplelevel) + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) return; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM); msg.m_databaseid = shared ? InvalidOid : MyDatabaseId; msg.m_tableoid = tableoid; - msg.m_analyze = analyze; - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */ + msg.m_autovacuum = IsAutoVacuumWorkerProcess(); msg.m_vacuumtime = GetCurrentTimestamp(); msg.m_tuples = tuples; pgstat_send(&msg, sizeof(msg)); @@ -1066,25 +1269,69 @@ pgstat_report_vacuum(Oid tableoid, bool shared, * -------- */ void -pgstat_report_analyze(Oid tableoid, bool shared, PgStat_Counter livetuples, - PgStat_Counter deadtuples) +pgstat_report_analyze(Relation rel, + PgStat_Counter livetuples, PgStat_Counter deadtuples) { PgStat_MsgAnalyze msg; - if (pgStatSock < 0 || - !pgstat_collect_tuplelevel) + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) return; + /* + * Unlike VACUUM, ANALYZE might be running inside a transaction that has + * already inserted and/or deleted rows in the target table. ANALYZE will + * have counted such rows as live or dead respectively. Because we will + * report our counts of such rows at transaction end, we should subtract + * off these counts from what we send to the collector now, else they'll + * be double-counted after commit. (This approach also ensures that the + * collector ends up with the right numbers if we abort instead of + * committing.) + */ + if (rel->pgstat_info != NULL) + { + PgStat_TableXactStatus *trans; + + for (trans = rel->pgstat_info->trans; trans; trans = trans->upper) + { + livetuples -= trans->tuples_inserted - trans->tuples_deleted; + deadtuples -= trans->tuples_updated + trans->tuples_deleted; + } + /* count stuff inserted by already-aborted subxacts, too */ + deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples; + /* Since ANALYZE's counts are estimates, we could have underflowed */ + livetuples = Max(livetuples, 0); + deadtuples = Max(deadtuples, 0); + } + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); - msg.m_databaseid = shared ? InvalidOid : MyDatabaseId; - msg.m_tableoid = tableoid; - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */ + msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId; + msg.m_tableoid = RelationGetRelid(rel); + msg.m_autovacuum = IsAutoVacuumWorkerProcess(); msg.m_analyzetime = GetCurrentTimestamp(); msg.m_live_tuples = livetuples; msg.m_dead_tuples = deadtuples; pgstat_send(&msg, sizeof(msg)); } +/* -------- + * pgstat_report_recovery_conflict() - + * + * Tell the collector about a Hot Standby recovery conflict. + * -------- + */ +void +pgstat_report_recovery_conflict(int reason) +{ + PgStat_MsgRecoveryConflict msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT); + msg.m_databaseid = MyDatabaseId; + msg.m_reason = reason; + pgstat_send(&msg, sizeof(msg)); +} /* ---------- * pgstat_ping() - @@ -1097,13 +1344,152 @@ pgstat_ping(void) { PgStat_MsgDummy msg; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY); pgstat_send(&msg, sizeof(msg)); } +/* ---------- + * pgstat_send_inquiry() - + * + * Notify collector that we need fresh data. + * ts specifies the minimum acceptable timestamp for the stats file. + * ---------- + */ +static void +pgstat_send_inquiry(TimestampTz ts) +{ + PgStat_MsgInquiry msg; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); + msg.inquiry_time = ts; + pgstat_send(&msg, sizeof(msg)); +} + + +/* + * Initialize function call usage data. + * Called by the executor before invoking a function. + */ +void +pgstat_init_function_usage(FunctionCallInfoData *fcinfo, + PgStat_FunctionCallUsage *fcu) +{ + PgStat_BackendFunctionEntry *htabent; + bool found; + + if (pgstat_track_functions <= fcinfo->flinfo->fn_stats) + { + /* stats not wanted */ + fcu->fs = NULL; + return; + } + + if (!pgStatFunctions) + { + /* First time through - initialize function stat table */ + HASHCTL hash_ctl; + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry); + hash_ctl.hash = oid_hash; + pgStatFunctions = hash_create("Function stat entries", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION); + } + + /* Get the stats entry for this function, create if necessary */ + htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid, + HASH_ENTER, &found); + if (!found) + MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts)); + + fcu->fs = &htabent->f_counts; + + /* save stats for this function, later used to compensate for recursion */ + fcu->save_f_time = htabent->f_counts.f_time; + + /* save current backend-wide total time */ + fcu->save_total = total_func_time; + + /* get clock time as of function start */ + INSTR_TIME_SET_CURRENT(fcu->f_start); +} + +/* + * find_funcstat_entry - find any existing PgStat_BackendFunctionEntry entry + * for specified function + * + * If no entry, return NULL, don't create a new one + */ +PgStat_BackendFunctionEntry * +find_funcstat_entry(Oid func_id) +{ + if (pgStatFunctions == NULL) + return NULL; + + return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions, + (void *) &func_id, + HASH_FIND, NULL); +} + +/* + * Calculate function call usage and update stat counters. + * Called by the executor after invoking a function. + * + * In the case of a set-returning function that runs in value-per-call mode, + * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage + * calls for what the user considers a single call of the function. The + * finalize flag should be TRUE on the last call. + */ +void +pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) +{ + PgStat_FunctionCounts *fs = fcu->fs; + instr_time f_total; + instr_time f_others; + instr_time f_self; + + /* stats not wanted? */ + if (fs == NULL) + return; + + /* total elapsed time in this function call */ + INSTR_TIME_SET_CURRENT(f_total); + INSTR_TIME_SUBTRACT(f_total, fcu->f_start); + + /* self usage: elapsed minus anything already charged to other calls */ + f_others = total_func_time; + INSTR_TIME_SUBTRACT(f_others, fcu->save_total); + f_self = f_total; + INSTR_TIME_SUBTRACT(f_self, f_others); + + /* update backend-wide total time */ + INSTR_TIME_ADD(total_func_time, f_self); + + /* + * Compute the new total f_time as the total elapsed time added to the + * pre-call value of f_time. This is necessary to avoid double-counting + * any time taken by recursive calls of myself. (We do not need any + * similar kluge for self time, since that already excludes any recursive + * calls.) + */ + INSTR_TIME_ADD(f_total, fcu->save_f_time); + + /* update counters in function stats table */ + if (finalize) + fs->f_numcalls++; + fs->f_time = f_total; + INSTR_TIME_ADD(fs->f_time_self, f_self); + + /* indicate that we have something to send */ + have_function_stats = true; +} + /* ---------- * pgstat_initstats() - @@ -1126,15 +1512,14 @@ pgstat_initstats(Relation rel) /* We only count stats for things that have storage */ if (!(relkind == RELKIND_RELATION || relkind == RELKIND_INDEX || - relkind == RELKIND_TOASTVALUE)) + relkind == RELKIND_TOASTVALUE || + relkind == RELKIND_SEQUENCE)) { rel->pgstat_info = NULL; return; } - if (pgStatSock < 0 || - !(pgstat_collect_tuplelevel || - pgstat_collect_blocklevel)) + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) { /* We're not counting at all */ rel->pgstat_info = NULL; @@ -1142,8 +1527,8 @@ pgstat_initstats(Relation rel) } /* - * If we already set up this relation in the current transaction, - * nothing to do. + * If we already set up this relation in the current transaction, nothing + * to do. */ if (rel->pgstat_info != NULL && rel->pgstat_info->t_id == rel_id) @@ -1180,9 +1565,9 @@ get_tabstat_entry(Oid rel_id, bool isshared) if (tsa->tsa_used < TABSTAT_QUANTUM) { /* - * It must not be present, but we found a free slot instead. - * Fine, let's use this one. We assume the entry was already - * zeroed, either at creation or after last use. + * It must not be present, but we found a free slot instead. Fine, + * let's use this one. We assume the entry was already zeroed, + * either at creation or after last use. */ entry = &tsa->tsa_entries[tsa->tsa_used++]; entry->t_id = rel_id; @@ -1210,6 +1595,32 @@ get_tabstat_entry(Oid rel_id, bool isshared) return entry; } +/* + * find_tabstat_entry - find any existing PgStat_TableStatus entry for rel + * + * If no entry, return NULL, don't create a new one + */ +PgStat_TableStatus * +find_tabstat_entry(Oid rel_id) +{ + PgStat_TableStatus *entry; + TabStatusArray *tsa; + int i; + + for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next) + { + for (i = 0; i < tsa->tsa_used; i++) + { + entry = &tsa->tsa_entries[i]; + if (entry->t_id == rel_id) + return entry; + } + } + + /* Not present */ + return NULL; +} + /* * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed */ @@ -1242,8 +1653,8 @@ add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level) PgStat_TableXactStatus *trans; /* - * If this is the first rel to be modified at the current nest level, - * we first have to push a transaction stack entry. + * If this is the first rel to be modified at the current nest level, we + * first have to push a transaction stack entry. */ xact_state = get_tabstat_stack_level(nest_level); @@ -1267,14 +1678,11 @@ pgstat_count_heap_insert(Relation rel) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; - if (pgstat_collect_tuplelevel && pgstat_info != NULL) + if (pgstat_info != NULL) { - int nest_level = GetCurrentTransactionNestLevel(); + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); - /* t_tuples_inserted is nontransactional, so just advance it */ - pgstat_info->t_counts.t_tuples_inserted++; - - /* We have to log the transactional effect at the proper level */ if (pgstat_info->trans == NULL || pgstat_info->trans->nest_level != nest_level) add_tabstat_xact_level(pgstat_info, nest_level); @@ -1287,25 +1695,24 @@ pgstat_count_heap_insert(Relation rel) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel) +pgstat_count_heap_update(Relation rel, bool hot) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; - if (pgstat_collect_tuplelevel && pgstat_info != NULL) + if (pgstat_info != NULL) { - int nest_level = GetCurrentTransactionNestLevel(); - - /* t_tuples_updated is nontransactional, so just advance it */ - pgstat_info->t_counts.t_tuples_updated++; + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); - /* We have to log the transactional effect at the proper level */ if (pgstat_info->trans == NULL || pgstat_info->trans->nest_level != nest_level) add_tabstat_xact_level(pgstat_info, nest_level); - /* An UPDATE both inserts a new tuple and deletes the old */ - pgstat_info->trans->tuples_inserted++; - pgstat_info->trans->tuples_deleted++; + pgstat_info->trans->tuples_updated++; + + /* t_tuples_hot_updated is nontransactional, so just advance it */ + if (hot) + pgstat_info->t_counts.t_tuples_hot_updated++; } } @@ -1317,14 +1724,11 @@ pgstat_count_heap_delete(Relation rel) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; - if (pgstat_collect_tuplelevel && pgstat_info != NULL) + if (pgstat_info != NULL) { - int nest_level = GetCurrentTransactionNestLevel(); - - /* t_tuples_deleted is nontransactional, so just advance it */ - pgstat_info->t_counts.t_tuples_deleted++; + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); - /* We have to log the transactional effect at the proper level */ if (pgstat_info->trans == NULL || pgstat_info->trans->nest_level != nest_level) add_tabstat_xact_level(pgstat_info, nest_level); @@ -1333,6 +1737,23 @@ pgstat_count_heap_delete(Relation rel) } } +/* + * pgstat_update_heap_dead_tuples - update dead-tuples count + * + * The semantics of this are that we are reporting the nontransactional + * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases + * rather than increasing, and the change goes straight into the per-table + * counter, not into transactional state. + */ +void +pgstat_update_heap_dead_tuples(Relation rel, int delta) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + pgstat_info->t_counts.t_delta_dead_tuples -= delta; +} + /* ---------- * AtEOXact_PgStat @@ -1356,8 +1777,8 @@ AtEOXact_PgStat(bool isCommit) /* * Transfer transactional insert/update counts into the base tabstat - * entries. We don't bother to free any of the transactional state, - * since it's all in TopTransactionContext and will go away anyway. + * entries. We don't bother to free any of the transactional state, since + * it's all in TopTransactionContext and will go away anyway. */ xact_state = pgStatXactStack; if (xact_state != NULL) @@ -1374,15 +1795,29 @@ AtEOXact_PgStat(bool isCommit) Assert(trans->upper == NULL); tabstat = trans->parent; Assert(tabstat->trans == trans); + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; if (isCommit) { - tabstat->t_counts.t_new_live_tuples += trans->tuples_inserted; - tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted; + /* insert adds a live tuple, delete removes one */ + tabstat->t_counts.t_delta_live_tuples += + trans->tuples_inserted - trans->tuples_deleted; + /* update and delete each create a dead tuple */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_updated + trans->tuples_deleted; + /* insert, update, delete each count as one change event */ + tabstat->t_counts.t_changed_tuples += + trans->tuples_inserted + trans->tuples_updated + + trans->tuples_deleted; } else { /* inserted tuples are dead, deleted tuples are unaffected */ - tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted; + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; + /* an aborted xact generates no changed_tuple events */ } tabstat->trans = NULL; } @@ -1431,6 +1866,7 @@ AtEOSubXact_PgStat(bool isCommit, int nestDepth) if (trans->upper && trans->upper->nest_level == nestDepth - 1) { trans->upper->tuples_inserted += trans->tuples_inserted; + trans->upper->tuples_updated += trans->tuples_updated; trans->upper->tuples_deleted += trans->tuples_deleted; tabstat->trans = trans->upper; pfree(trans); @@ -1438,11 +1874,11 @@ AtEOSubXact_PgStat(bool isCommit, int nestDepth) else { /* - * When there isn't an immediate parent state, we can - * just reuse the record instead of going through a + * When there isn't an immediate parent state, we can just + * reuse the record instead of going through a * palloc/pfree pushup (this works since it's all in - * TopTransactionContext anyway). We have to re-link - * it into the parent level, though, and that might mean + * TopTransactionContext anyway). We have to re-link it + * into the parent level, though, and that might mean * pushing a new entry into the pgStatXactStack. */ PgStat_SubXactStatus *upper_xact_state; @@ -1456,10 +1892,17 @@ AtEOSubXact_PgStat(bool isCommit, int nestDepth) else { /* - * On abort, inserted tuples are dead (and can be bounced out - * to the top-level tabstat), deleted tuples are unaffected + * On abort, update top-level tabstat counts, then forget the + * subtransaction */ - tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted; + + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; + /* inserted tuples are dead, deleted tuples are unaffected */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; tabstat->trans = trans->upper; pfree(trans); } @@ -1499,6 +1942,7 @@ AtPrepare_PgStat(void) Assert(tabstat->trans == trans); record.tuples_inserted = trans->tuples_inserted; + record.tuples_updated = trans->tuples_updated; record.tuples_deleted = trans->tuples_deleted; record.t_id = tabstat->t_id; record.t_shared = tabstat->t_shared; @@ -1514,7 +1958,7 @@ AtPrepare_PgStat(void) * Clean up after successful PREPARE. * * All we need do here is unlink the transaction stats state from the - * nontransactional state. The nontransactional action counts will be + * nontransactional state. The nontransactional action counts will be * reported to the stats collector immediately, while the effects on live * and dead tuple counts are preserved in the 2PC state file. * @@ -1526,8 +1970,8 @@ PostPrepare_PgStat(void) PgStat_SubXactStatus *xact_state; /* - * We don't bother to free any of the transactional state, - * since it's all in TopTransactionContext and will go away anyway. + * We don't bother to free any of the transactional state, since it's all + * in TopTransactionContext and will go away anyway. */ xact_state = pgStatXactStack; if (xact_state != NULL) @@ -1563,8 +2007,17 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, /* Find or create a tabstat entry for the rel */ pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); - pgstat_info->t_counts.t_new_live_tuples += rec->tuples_inserted; - pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted; + /* Same math as in AtEOXact_PgStat, commit case */ + pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; + pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated; + pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted; + pgstat_info->t_counts.t_delta_live_tuples += + rec->tuples_inserted - rec->tuples_deleted; + pgstat_info->t_counts.t_delta_dead_tuples += + rec->tuples_updated + rec->tuples_deleted; + pgstat_info->t_counts.t_changed_tuples += + rec->tuples_inserted + rec->tuples_updated + + rec->tuples_deleted; } /* @@ -1583,8 +2036,12 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, /* Find or create a tabstat entry for the rel */ pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); - /* inserted tuples are dead, deleted tuples are no-ops */ - pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted; + /* Same math as in AtEOXact_PgStat, abort case */ + pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; + pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated; + pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted; + pgstat_info->t_counts.t_delta_dead_tuples += + rec->tuples_inserted + rec->tuples_updated; } @@ -1673,6 +2130,35 @@ pgstat_fetch_stat_tabentry(Oid relid) } +/* ---------- + * pgstat_fetch_stat_funcentry() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + * ---------- + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatFuncEntry *funcentry = NULL; + + /* load the stats file if needed */ + backend_read_statsfile(); + + /* Lookup our database, then find the requested function. */ + dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); + if (dbentry != NULL && dbentry->functions != NULL) + { + funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, + (void *) &func_id, + HASH_FIND, NULL); + } + + return funcentry; +} + + /* ---------- * pgstat_fetch_stat_beentry() - * @@ -1714,8 +2200,8 @@ pgstat_fetch_stat_numbackends(void) * --------- * pgstat_fetch_global() - * - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the global statistics struct. + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the global statistics struct. * --------- */ PgStat_GlobalStats * @@ -1734,6 +2220,9 @@ pgstat_fetch_global(void) static PgBackendStatus *BackendStatusArray = NULL; static PgBackendStatus *MyBEEntry = NULL; +static char *BackendClientHostnameBuffer = NULL; +static char *BackendAppnameBuffer = NULL; +static char *BackendActivityBuffer = NULL; /* @@ -1745,19 +2234,29 @@ BackendStatusShmemSize(void) Size size; size = mul_size(sizeof(PgBackendStatus), MaxBackends); + size = add_size(size, + mul_size(NAMEDATALEN, MaxBackends)); + size = add_size(size, + mul_size(pgstat_track_activity_query_size, MaxBackends)); + size = add_size(size, + mul_size(NAMEDATALEN, MaxBackends)); return size; } /* - * Initialize the shared status array during postmaster startup. + * Initialize the shared status array and several string buffers + * during postmaster startup. */ void CreateSharedBackendStatus(void) { - Size size = BackendStatusShmemSize(); + Size size; bool found; + int i; + char *buffer; /* Create or attach to the shared array */ + size = mul_size(sizeof(PgBackendStatus), MaxBackends); BackendStatusArray = (PgBackendStatus *) ShmemInitStruct("Backend Status Array", size, &found); @@ -1768,31 +2267,104 @@ CreateSharedBackendStatus(void) */ MemSet(BackendStatusArray, 0, size); } + + /* Create or attach to the shared appname buffer */ + size = mul_size(NAMEDATALEN, MaxBackends); + BackendAppnameBuffer = (char *) + ShmemInitStruct("Backend Application Name Buffer", size, &found); + + if (!found) + { + MemSet(BackendAppnameBuffer, 0, size); + + /* Initialize st_appname pointers. */ + buffer = BackendAppnameBuffer; + for (i = 0; i < MaxBackends; i++) + { + BackendStatusArray[i].st_appname = buffer; + buffer += NAMEDATALEN; + } + } + + /* Create or attach to the shared client hostname buffer */ + size = mul_size(NAMEDATALEN, MaxBackends); + BackendClientHostnameBuffer = (char *) + ShmemInitStruct("Backend Client Host Name Buffer", size, &found); + + if (!found) + { + MemSet(BackendClientHostnameBuffer, 0, size); + + /* Initialize st_clienthostname pointers. */ + buffer = BackendClientHostnameBuffer; + for (i = 0; i < MaxBackends; i++) + { + BackendStatusArray[i].st_clienthostname = buffer; + buffer += NAMEDATALEN; + } + } + + /* Create or attach to the shared activity buffer */ + size = mul_size(pgstat_track_activity_query_size, MaxBackends); + BackendActivityBuffer = (char *) + ShmemInitStruct("Backend Activity Buffer", size, &found); + + if (!found) + { + MemSet(BackendActivityBuffer, 0, size); + + /* Initialize st_activity pointers. */ + buffer = BackendActivityBuffer; + for (i = 0; i < MaxBackends; i++) + { + BackendStatusArray[i].st_activity = buffer; + buffer += pgstat_track_activity_query_size; + } + } } +/* ---------- + * pgstat_initialize() - + * + * Initialize pgstats state, and set up our on-proc-exit hook. + * Called from InitPostgres. MyBackendId must be set, + * but we must not have started any transaction yet (since the + * exit hook must run after the last transaction exit). + * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful. + * ---------- + */ +void +pgstat_initialize(void) +{ + /* Initialize MyBEEntry */ + Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends); + MyBEEntry = &BackendStatusArray[MyBackendId - 1]; + + /* Set up a process-exit hook to clean up */ + on_shmem_exit(pgstat_beshutdown_hook, 0); +} + /* ---------- * pgstat_bestart() - * - * Initialize this backend's entry in the PgBackendStatus array, - * and set up an on-proc-exit hook that will clear it again. - * Called from InitPostgres. MyBackendId and MyDatabaseId must be set. + * Initialize this backend's entry in the PgBackendStatus array. + * Called from InitPostgres. + * MyDatabaseId, session userid, and application_name must be set + * (hence, this cannot be combined with pgstat_initialize). * ---------- */ void pgstat_bestart(void) { - volatile PgBackendStatus *beentry; TimestampTz proc_start_timestamp; Oid userid; SockAddr clientaddr; - - Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends); - MyBEEntry = &BackendStatusArray[MyBackendId - 1]; + volatile PgBackendStatus *beentry; /* - * To minimize the time spent modifying the entry, fetch all the needed - * data first. + * To minimize the time spent modifying the PgBackendStatus entry, fetch + * all the needed data first. * * If we have a MyProcPort, use its session start time (for consistency, * and to save a kernel call). @@ -1828,22 +2400,28 @@ pgstat_bestart(void) beentry->st_procpid = MyProcPid; beentry->st_proc_start_timestamp = proc_start_timestamp; beentry->st_activity_start_timestamp = 0; - beentry->st_txn_start_timestamp = 0; + beentry->st_xact_start_timestamp = 0; beentry->st_databaseid = MyDatabaseId; beentry->st_userid = userid; beentry->st_clientaddr = clientaddr; + beentry->st_clienthostname[0] = '\0'; beentry->st_waiting = false; + beentry->st_appname[0] = '\0'; beentry->st_activity[0] = '\0'; - /* Also make sure the last byte in the string area is always 0 */ - beentry->st_activity[PGBE_ACTIVITY_SIZE - 1] = '\0'; + /* Also make sure the last byte in each string area is always 0 */ + beentry->st_clienthostname[NAMEDATALEN - 1] = '\0'; + beentry->st_appname[NAMEDATALEN - 1] = '\0'; + beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0'; beentry->st_changecount++; Assert((beentry->st_changecount & 1) == 0); - /* - * Set up a process-exit hook to clean up. - */ - on_shmem_exit(pgstat_beshutdown_hook, 0); + if (MyProcPort && MyProcPort->remote_hostname) + strlcpy(beentry->st_clienthostname, MyProcPort->remote_hostname, NAMEDATALEN); + + /* Update app name to current GUC setting */ + if (application_name) + pgstat_report_appname(application_name); } /* @@ -1860,7 +2438,14 @@ pgstat_beshutdown_hook(int code, Datum arg) { volatile PgBackendStatus *beentry = MyBEEntry; - pgstat_report_tabstat(true); + /* + * If we got as far as discovering our own database ID, we can report what + * we did to the collector. Otherwise, we'd be sending an invalid + * database ID, so forget it. (This means that accesses to pg_database + * during failed backend starts might never get counted.) + */ + if (OidIsValid(MyDatabaseId)) + pgstat_report_stat(true); /* * Clear my status entry, following the protocol of bumping st_changecount @@ -1890,7 +2475,9 @@ pgstat_report_activity(const char *cmd_str) TimestampTz start_timestamp; int len; - if (!pgstat_collect_querystring || !beentry) + TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str); + + if (!pgstat_track_activities || !beentry) return; /* @@ -1900,7 +2487,7 @@ pgstat_report_activity(const char *cmd_str) start_timestamp = GetCurrentStatementStartTimestamp(); len = strlen(cmd_str); - len = pg_mbcliplen(cmd_str, len, PGBE_ACTIVITY_SIZE - 1); + len = pg_mbcliplen(cmd_str, len, pgstat_track_activity_query_size - 1); /* * Update my status entry, following the protocol of bumping @@ -1917,26 +2504,57 @@ pgstat_report_activity(const char *cmd_str) Assert((beentry->st_changecount & 1) == 0); } +/* ---------- + * pgstat_report_appname() - + * + * Called to update our application name. + * ---------- + */ +void +pgstat_report_appname(const char *appname) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + int len; + + if (!beentry) + return; + + /* This should be unnecessary if GUC did its job, but be safe */ + len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1); + + /* + * Update my status entry, following the protocol of bumping + * st_changecount before and after. We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + */ + beentry->st_changecount++; + + memcpy((char *) beentry->st_appname, appname, len); + beentry->st_appname[len] = '\0'; + + beentry->st_changecount++; + Assert((beentry->st_changecount & 1) == 0); +} + /* - * Set the current transaction start timestamp to the specified - * value. If there is no current active transaction, this is signified - * by 0. + * Report current transaction start timestamp as the specified value. + * Zero means there is no active transaction. */ void -pgstat_report_txn_timestamp(TimestampTz tstamp) +pgstat_report_xact_timestamp(TimestampTz tstamp) { volatile PgBackendStatus *beentry = MyBEEntry; - if (!pgstat_collect_querystring || !beentry) + if (!pgstat_track_activities || !beentry) return; /* * Update my status entry, following the protocol of bumping - * st_changecount before and after. We use a volatile pointer - * here to ensure the compiler doesn't try to get cute. + * st_changecount before and after. We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. */ beentry->st_changecount++; - beentry->st_txn_start_timestamp = tstamp; + beentry->st_xact_start_timestamp = tstamp; beentry->st_changecount++; Assert((beentry->st_changecount & 1) == 0); } @@ -1955,7 +2573,7 @@ pgstat_report_waiting(bool waiting) { volatile PgBackendStatus *beentry = MyBEEntry; - if (!pgstat_collect_querystring || !beentry) + if (!pgstat_track_activities || !beentry) return; /* @@ -1980,6 +2598,8 @@ pgstat_read_current_status(void) volatile PgBackendStatus *beentry; PgBackendStatus *localtable; PgBackendStatus *localentry; + char *localappname, + *localactivity; int i; Assert(!pgStatRunningInCollector); @@ -1991,6 +2611,12 @@ pgstat_read_current_status(void) localtable = (PgBackendStatus *) MemoryContextAlloc(pgStatLocalContext, sizeof(PgBackendStatus) * MaxBackends); + localappname = (char *) + MemoryContextAlloc(pgStatLocalContext, + NAMEDATALEN * MaxBackends); + localactivity = (char *) + MemoryContextAlloc(pgStatLocalContext, + pgstat_track_activity_query_size * MaxBackends); localNumBackends = 0; beentry = BackendStatusArray; @@ -2008,11 +2634,20 @@ pgstat_read_current_status(void) { int save_changecount = beentry->st_changecount; - /* - * XXX if PGBE_ACTIVITY_SIZE is really large, it might be best to - * use strcpy not memcpy for copying the activity string? - */ - memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus)); + localentry->st_procpid = beentry->st_procpid; + if (localentry->st_procpid > 0) + { + memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus)); + + /* + * strcpy is safe even if the string is modified concurrently, + * because there's always a \0 at the end of the buffer. + */ + strcpy(localappname, (char *) beentry->st_appname); + localentry->st_appname = localappname; + strcpy(localactivity, (char *) beentry->st_activity); + localentry->st_activity = localactivity; + } if (save_changecount == beentry->st_changecount && (save_changecount & 1) == 0) @@ -2027,6 +2662,8 @@ pgstat_read_current_status(void) if (localentry->st_procpid > 0) { localentry++; + localappname += NAMEDATALEN; + localactivity += pgstat_track_activity_query_size; localNumBackends++; } } @@ -2036,6 +2673,80 @@ pgstat_read_current_status(void) } +/* ---------- + * pgstat_get_backend_current_activity() - + * + * Return a string representing the current activity of the backend with + * the specified PID. This looks directly at the BackendStatusArray, + * and so will provide current information regardless of the age of our + * transaction's snapshot of the status array. + * + * It is the caller's responsibility to invoke this only for backends whose + * state is expected to remain stable while the result is in use. The + * only current use is in deadlock reporting, where we can expect that + * the target backend is blocked on a lock. (There are corner cases + * where the target's wait could get aborted while we are looking at it, + * but the very worst consequence is to return a pointer to a string + * that's been changed, so we won't worry too much.) + * + * Note: return strings for special cases match pg_stat_get_backend_activity. + * ---------- + */ +const char * +pgstat_get_backend_current_activity(int pid, bool checkUser) +{ + PgBackendStatus *beentry; + int i; + + beentry = BackendStatusArray; + for (i = 1; i <= MaxBackends; i++) + { + /* + * Although we expect the target backend's entry to be stable, that + * doesn't imply that anyone else's is. To avoid identifying the + * wrong backend, while we check for a match to the desired PID we + * must follow the protocol of retrying if st_changecount changes + * while we examine the entry, or if it's odd. (This might be + * unnecessary, since fetching or storing an int is almost certainly + * atomic, but let's play it safe.) We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + */ + volatile PgBackendStatus *vbeentry = beentry; + bool found; + + for (;;) + { + int save_changecount = vbeentry->st_changecount; + + found = (vbeentry->st_procpid == pid); + + if (save_changecount == vbeentry->st_changecount && + (save_changecount & 1) == 0) + break; + + /* Make sure we can break out of loop if stuck... */ + CHECK_FOR_INTERRUPTS(); + } + + if (found) + { + /* Now it is safe to use the non-volatile pointer */ + if (checkUser && !superuser() && beentry->st_userid != GetUserId()) + return ""; + else if (*(beentry->st_activity) == '\0') + return ""; + else + return beentry->st_activity; + } + + beentry++; + } + + /* If we get here, caller is in error ... */ + return ""; +} + + /* ------------------------------------------------------------ * Local support functions follow * ------------------------------------------------------------ @@ -2066,7 +2777,7 @@ pgstat_send(void *msg, int len) { int rc; - if (pgStatSock < 0) + if (pgStatSock == PGINVALID_SOCKET) return; ((PgStat_MsgHdr *) msg)->m_size = len; @@ -2087,7 +2798,7 @@ pgstat_send(void *msg, int len) /* ---------- * pgstat_send_bgwriter() - * - * Send bgwriter statistics to the collector + * Send bgwriter statistics to the collector * ---------- */ void @@ -2097,9 +2808,9 @@ pgstat_send_bgwriter(void) static const PgStat_MsgBgWriter all_zeroes; /* - * This function can be called even if nothing at all has happened. - * In this case, avoid sending a completely empty message to - * the stats collector. + * This function can be called even if nothing at all has happened. In + * this case, avoid sending a completely empty message to the stats + * collector. */ if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0) return; @@ -2129,8 +2840,6 @@ pgstat_send_bgwriter(void) NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) { - struct itimerval write_timeout; - bool need_timer = false; int len; PgStat_Msg msg; @@ -2147,11 +2856,13 @@ PgstatCollectorMain(int argc, char *argv[]) MyProcPid = getpid(); /* reset MyProcPid */ + MyStartTime = time(NULL); /* record Start Time for logging */ + /* * If possible, make this process a group leader, so that the postmaster - * can signal any child processes too. (pgstat probably never has - * any child processes, but for consistency we make all postmaster - * child processes do this.) + * can signal any child processes too. (pgstat probably never has any + * child processes, but for consistency we make all postmaster child + * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) @@ -2160,13 +2871,13 @@ PgstatCollectorMain(int argc, char *argv[]) /* * Ignore all signals usually bound to some action in the postmaster, - * except SIGQUIT and SIGALRM. + * except SIGQUIT. */ - pqsignal(SIGHUP, SIG_IGN); + pqsignal(SIGHUP, pgstat_sighup_handler); pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, SIG_IGN); pqsignal(SIGQUIT, pgstat_exit); - pqsignal(SIGALRM, force_statwrite); + pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); pqsignal(SIGUSR2, SIG_IGN); @@ -2185,19 +2896,15 @@ PgstatCollectorMain(int argc, char *argv[]) /* * Arrange to write the initial status file right away */ - need_statwrite = true; - - /* Preset the delay between status file writes */ - MemSet(&write_timeout, 0, sizeof(struct itimerval)); - write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000; - write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000; + last_statrequest = GetCurrentTimestamp(); + last_statwrite = last_statrequest - 1; /* * Read in an existing statistics stats file or initialize the stats to * zero. */ pgStatRunningInCollector = true; - pgStatDBHash = pgstat_read_statsfile(InvalidOid); + pgStatDBHash = pgstat_read_statsfile(InvalidOid, true); /* * Setup the descriptor set for select(2). Since only one bit in the set @@ -2212,8 +2919,9 @@ PgstatCollectorMain(int argc, char *argv[]) * death of our parent postmaster. * * For performance reasons, we don't want to do a PostmasterIsAlive() test - * after every message; instead, do it at statwrite time and if - * select()/poll() is interrupted by timeout. + * after every message; instead, do it only when select()/poll() is + * interrupted by timeout. In essence, we'll stay alive as long as + * backends keep sending us stuff often, even if the postmaster is gone. */ for (;;) { @@ -2226,22 +2934,21 @@ PgstatCollectorMain(int argc, char *argv[]) break; /* - * If time to write the stats file, do so. Note that the alarm - * interrupt isn't re-enabled immediately, but only after we next - * receive a stats message; so no cycles are wasted when there is - * nothing going on. + * Reload configuration if we got SIGHUP from the postmaster. */ - if (need_statwrite) + if (got_SIGHUP) { - /* Check for postmaster death; if so we'll write file below */ - if (!PostmasterIsAlive(true)) - break; - - pgstat_write_statsfile(); - need_statwrite = false; - need_timer = true; + ProcessConfigFile(PGC_SIGHUP); + got_SIGHUP = false; } + /* + * Write the stats file if a new request has arrived that is not + * satisfied by existing file. + */ + if (last_statwrite < last_statrequest) + pgstat_write_statsfile(false); + /* * Wait for a message to arrive; but not for more than * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will @@ -2250,8 +2957,8 @@ PgstatCollectorMain(int argc, char *argv[]) * poll/select call, so this also limits speed of response to SIGQUIT, * which is more important.) * - * We use poll(2) if available, otherwise select(2). - * Win32 has its own implementation. + * We use poll(2) if available, otherwise select(2). Win32 has its own + * implementation. */ #ifndef WIN32 #ifdef HAVE_POLL @@ -2291,9 +2998,9 @@ PgstatCollectorMain(int argc, char *argv[]) got_data = FD_ISSET(pgStatSock, &rfds); #endif /* HAVE_POLL */ -#else /* WIN32 */ +#else /* WIN32 */ got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ, - PGSTAT_SELECT_TIMEOUT*1000); + PGSTAT_SELECT_TIMEOUT * 1000); #endif /* @@ -2333,6 +3040,10 @@ PgstatCollectorMain(int argc, char *argv[]) case PGSTAT_MTYPE_DUMMY: break; + case PGSTAT_MTYPE_INQUIRY: + pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len); + break; + case PGSTAT_MTYPE_TABSTAT: pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len); break; @@ -2350,6 +3061,18 @@ PgstatCollectorMain(int argc, char *argv[]) len); break; + case PGSTAT_MTYPE_RESETSHAREDCOUNTER: + pgstat_recv_resetsharedcounter( + (PgStat_MsgResetsharedcounter *) &msg, + len); + break; + + case PGSTAT_MTYPE_RESETSINGLECOUNTER: + pgstat_recv_resetsinglecounter( + (PgStat_MsgResetsinglecounter *) &msg, + len); + break; + case PGSTAT_MTYPE_AUTOVAC_START: pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len); break; @@ -2366,21 +3089,20 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len); break; - default: + case PGSTAT_MTYPE_FUNCSTAT: + pgstat_recv_funcstat((PgStat_MsgFuncstat *) &msg, len); break; - } - /* - * If this is the first message after we wrote the stats file the - * last time, enable the alarm interrupt to make it be written - * again later. - */ - if (need_timer) - { - if (setitimer(ITIMER_REAL, &write_timeout, NULL)) - ereport(ERROR, - (errmsg("could not set statistics collector timer: %m"))); - need_timer = false; + case PGSTAT_MTYPE_FUNCPURGE: + pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len); + break; + + case PGSTAT_MTYPE_RECOVERYCONFLICT: + pgstat_recv_recoveryconflict((PgStat_MsgRecoveryConflict *) &msg, len); + break; + + default: + break; } } else @@ -2397,7 +3119,7 @@ PgstatCollectorMain(int argc, char *argv[]) /* * Save the final stats to reuse at next startup. */ - pgstat_write_statsfile(); + pgstat_write_statsfile(true); exit(0); } @@ -2410,11 +3132,11 @@ pgstat_exit(SIGNAL_ARGS) need_exit = true; } -/* SIGALRM signal handler for collector process */ +/* SIGHUP handler for collector process */ static void -force_statwrite(SIGNAL_ARGS) +pgstat_sighup_handler(SIGNAL_ARGS) { - need_statwrite = true; + got_SIGHUP = true; } @@ -2444,6 +3166,7 @@ pgstat_get_db_entry(Oid databaseid, bool create) HASHCTL hash_ctl; result->tables = NULL; + result->functions = NULL; result->n_xact_commit = 0; result->n_xact_rollback = 0; result->n_blocks_fetched = 0; @@ -2454,6 +3177,13 @@ pgstat_get_db_entry(Oid databaseid, bool create) result->n_tuples_updated = 0; result->n_tuples_deleted = 0; result->last_autovac_time = 0; + result->n_conflict_tablespace = 0; + result->n_conflict_lock = 0; + result->n_conflict_snapshot = 0; + result->n_conflict_bufferpin = 0; + result->n_conflict_startup_deadlock = 0; + + result->stat_reset_timestamp = GetCurrentTimestamp(); memset(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(Oid); @@ -2463,6 +3193,63 @@ pgstat_get_db_entry(Oid databaseid, bool create) PGSTAT_TAB_HASH_SIZE, &hash_ctl, HASH_ELEM | HASH_FUNCTION); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); + hash_ctl.hash = oid_hash; + result->functions = hash_create("Per-database function", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION); + } + + return result; +} + + +/* + * Lookup the hash table entry for the specified table. If no hash + * table entry exists, initialize it, if the create parameter is true. + * Else, return NULL. + */ +static PgStat_StatTabEntry * +pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) +{ + PgStat_StatTabEntry *result; + bool found; + HASHACTION action = (create ? HASH_ENTER : HASH_FIND); + + /* Lookup or create the hash table entry for this table */ + result = (PgStat_StatTabEntry *) hash_search(dbentry->tables, + &tableoid, + action, &found); + + if (!create && !found) + return NULL; + + /* If not found, initialize the new one. */ + if (!found) + { + result->numscans = 0; + result->tuples_returned = 0; + result->tuples_fetched = 0; + result->tuples_inserted = 0; + result->tuples_updated = 0; + result->tuples_deleted = 0; + result->tuples_hot_updated = 0; + result->n_live_tuples = 0; + result->n_dead_tuples = 0; + result->changes_since_analyze = 0; + result->blocks_fetched = 0; + result->blocks_hit = 0; + result->vacuum_timestamp = 0; + result->vacuum_count = 0; + result->autovac_vacuum_timestamp = 0; + result->autovac_vacuum_count = 0; + result->analyze_timestamp = 0; + result->analyze_count = 0; + result->autovac_analyze_timestamp = 0; + result->autovac_analyze_count = 0; } return result; @@ -2473,31 +3260,44 @@ pgstat_get_db_entry(Oid databaseid, bool create) * pgstat_write_statsfile() - * * Tell the news. + * If writing to the permanent file (happens when the collector is + * shutting down only), remove the temporary file so that backends + * starting up under a new postmaster can't read the old data before + * the new collector is ready. * ---------- */ static void -pgstat_write_statsfile(void) +pgstat_write_statsfile(bool permanent) { HASH_SEQ_STATUS hstat; HASH_SEQ_STATUS tstat; + HASH_SEQ_STATUS fstat; PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; + PgStat_StatFuncEntry *funcentry; FILE *fpout; int32 format_id; + const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; /* * Open the statistics temp file to write out the current values. */ - fpout = fopen(PGSTAT_STAT_TMPFILE, PG_BINARY_W); + fpout = AllocateFile(tmpfile, PG_BINARY_W); if (fpout == NULL) { ereport(LOG, (errcode_for_file_access(), errmsg("could not open temporary statistics file \"%s\": %m", - PGSTAT_STAT_TMPFILE))); + tmpfile))); return; } + /* + * Set the timestamp of the stats file. + */ + globalStats.stats_timestamp = GetCurrentTimestamp(); + /* * Write the file header --- currently just a format ID. */ @@ -2517,8 +3317,8 @@ pgstat_write_statsfile(void) { /* * Write out the DB entry including the number of live backends. We - * don't write the tables pointer since it's of no use to any other - * process. + * don't write the tables or functions pointers, since they're of no + * use to any other process. */ fputc('D', fpout); fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout); @@ -2533,6 +3333,16 @@ pgstat_write_statsfile(void) fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); } + /* + * Walk through the database's function stats table. + */ + hash_seq_init(&fstat, dbentry->functions); + while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) + { + fputc('F', fpout); + fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); + } + /* * Mark the end of this DB */ @@ -2551,26 +3361,58 @@ pgstat_write_statsfile(void) ereport(LOG, (errcode_for_file_access(), errmsg("could not write temporary statistics file \"%s\": %m", - PGSTAT_STAT_TMPFILE))); - fclose(fpout); - unlink(PGSTAT_STAT_TMPFILE); + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); } - else if (fclose(fpout) < 0) + else if (FreeFile(fpout) < 0) { ereport(LOG, (errcode_for_file_access(), errmsg("could not close temporary statistics file \"%s\": %m", - PGSTAT_STAT_TMPFILE))); - unlink(PGSTAT_STAT_TMPFILE); + tmpfile))); + unlink(tmpfile); } - else if (rename(PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME) < 0) + else if (rename(tmpfile, statfile) < 0) { ereport(LOG, (errcode_for_file_access(), errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", - PGSTAT_STAT_TMPFILE, PGSTAT_STAT_FILENAME))); - unlink(PGSTAT_STAT_TMPFILE); + tmpfile, statfile))); + unlink(tmpfile); } + else + { + /* + * Successful write, so update last_statwrite. + */ + last_statwrite = globalStats.stats_timestamp; + + /* + * If there is clock skew between backends and the collector, we could + * receive a stats request time that's in the future. If so, complain + * and reset last_statrequest. Resetting ensures that no inquiry + * message can cause more than one stats file write to occur. + */ + if (last_statrequest > last_statwrite) + { + char *reqtime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + reqtime = pstrdup(timestamptz_to_str(last_statrequest)); + mytime = pstrdup(timestamptz_to_str(last_statwrite)); + elog(LOG, "last_statrequest %s is later than collector's time %s", + reqtime, mytime); + pfree(reqtime); + pfree(mytime); + + last_statrequest = last_statwrite; + } + } + + if (permanent) + unlink(pgstat_stat_filename); } @@ -2582,18 +3424,22 @@ pgstat_write_statsfile(void) * ---------- */ static HTAB * -pgstat_read_statsfile(Oid onlydb) +pgstat_read_statsfile(Oid onlydb, bool permanent) { PgStat_StatDBEntry *dbentry; PgStat_StatDBEntry dbbuf; PgStat_StatTabEntry *tabentry; PgStat_StatTabEntry tabbuf; + PgStat_StatFuncEntry funcbuf; + PgStat_StatFuncEntry *funcentry; HASHCTL hash_ctl; HTAB *dbhash; HTAB *tabhash = NULL; + HTAB *funchash = NULL; FILE *fpin; int32 format_id; bool found; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; /* * The tables will live in pgStatLocalContext. @@ -2617,13 +3463,30 @@ pgstat_read_statsfile(Oid onlydb) */ memset(&globalStats, 0, sizeof(globalStats)); + /* + * Set the current timestamp (will be kept only in case we can't load an + * existing statsfile. + */ + globalStats.stat_reset_timestamp = GetCurrentTimestamp(); + /* * Try to open the status file. If it doesn't exist, the backends simply * return zero for anything and the collector simply starts from scratch * with empty counters. + * + * ENOENT is a possibility if the stats collector is not running or has + * not yet written the stats file the first time. Any other failure + * condition is suspicious. */ - if ((fpin = AllocateFile(PGSTAT_STAT_FILENAME, PG_BINARY_R)) == NULL) + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); return dbhash; + } /* * Verify it's of the expected format. @@ -2632,7 +3495,7 @@ pgstat_read_statsfile(Oid onlydb) || format_id != PGSTAT_FILE_FORMAT_ID) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", statfile))); goto done; } @@ -2642,7 +3505,7 @@ pgstat_read_statsfile(Oid onlydb) if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", statfile))); goto done; } @@ -2656,15 +3519,16 @@ pgstat_read_statsfile(Oid onlydb) { /* * 'D' A PgStat_StatDBEntry struct describing a database - * follows. Subsequently, zero to many 'T' entries will follow - * until a 'd' is encountered. + * follows. Subsequently, zero to many 'T' and 'F' entries + * will follow until a 'd' is encountered. */ case 'D': if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables), fpin) != offsetof(PgStat_StatDBEntry, tables)) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", + statfile))); goto done; } @@ -2678,12 +3542,14 @@ pgstat_read_statsfile(Oid onlydb) if (found) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", + statfile))); goto done; } memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry)); dbentry->tables = NULL; + dbentry->functions = NULL; /* * Don't collect tables if not the requested DB (or the @@ -2704,13 +3570,23 @@ pgstat_read_statsfile(Oid onlydb) dbentry->tables = hash_create("Per-database table", PGSTAT_TAB_HASH_SIZE, &hash_ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); + hash_ctl.hash = oid_hash; + hash_ctl.hcxt = pgStatLocalContext; + dbentry->functions = hash_create("Per-database function", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); /* - * Arrange that following 'T's add entries to this database's - * tables hash table. + * Arrange that following records add entries to this + * database's hash tables. */ tabhash = dbentry->tables; + funchash = dbentry->functions; break; /* @@ -2718,6 +3594,7 @@ pgstat_read_statsfile(Oid onlydb) */ case 'd': tabhash = NULL; + funchash = NULL; break; /* @@ -2728,7 +3605,8 @@ pgstat_read_statsfile(Oid onlydb) fpin) != sizeof(PgStat_StatTabEntry)) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", + statfile))); goto done; } @@ -2745,13 +3623,48 @@ pgstat_read_statsfile(Oid onlydb) if (found) { ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", + statfile))); goto done; } memcpy(tabentry, &tabbuf, sizeof(tabbuf)); break; + /* + * 'F' A PgStat_StatFuncEntry follows. + */ + case 'F': + if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry), + fpin) != sizeof(PgStat_StatFuncEntry)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + /* + * Skip if function belongs to a not requested database. + */ + if (funchash == NULL) + break; + + funcentry = (PgStat_StatFuncEntry *) hash_search(funchash, + (void *) &funcbuf.functionid, + HASH_ENTER, &found); + + if (found) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + memcpy(funcentry, &funcbuf, sizeof(funcbuf)); + break; + /* * 'E' The EOF marker of a complete stats file. */ @@ -2760,7 +3673,8 @@ pgstat_read_statsfile(Oid onlydb) default: ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted pgstat.stat file"))); + (errmsg("corrupted statistics file \"%s\"", + statfile))); goto done; } } @@ -2768,9 +3682,70 @@ pgstat_read_statsfile(Oid onlydb) done: FreeFile(fpin); + if (permanent) + unlink(PGSTAT_STAT_PERMANENT_FILENAME); + return dbhash; } +/* ---------- + * pgstat_read_statsfile_timestamp() - + * + * Attempt to fetch the timestamp of an existing stats file. + * Returns TRUE if successful (timestamp is stored at *ts). + * ---------- + */ +static bool +pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) +{ + PgStat_GlobalStats myGlobalStats; + FILE *fpin; + int32 format_id; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + + /* + * Try to open the status file. As above, anything but ENOENT is worthy + * of complaining about. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + return false; + } + + /* + * Verify it's of the expected format. + */ + if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) + || format_id != PGSTAT_FILE_FORMAT_ID) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* + * Read global stats struct + */ + if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), fpin) != sizeof(myGlobalStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + *ts = myGlobalStats.stats_timestamp; + + FreeFile(fpin); + return true; +} + /* * If not already done, read the statistics collector stats file into * some hash tables. The results will be kept until pgstat_clear_snapshot() @@ -2779,16 +3754,63 @@ done: static void backend_read_statsfile(void) { + TimestampTz min_ts; + int count; + /* already read it? */ if (pgStatDBHash) return; Assert(!pgStatRunningInCollector); + /* + * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec + * before now. This indirectly ensures that the collector needn't write + * the file more often than PGSTAT_STAT_INTERVAL. In an autovacuum + * worker, however, we want a lower delay to avoid using stale data, so we + * use PGSTAT_RETRY_DELAY (since the number of worker is low, this + * shouldn't be a problem). + * + * Note that we don't recompute min_ts after sleeping; so we might end up + * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In practice + * that shouldn't happen, though, as long as the sleep time is less than + * PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about + * what our cutoff time really is. + */ + if (IsAutoVacuumWorkerProcess()) + min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + -PGSTAT_RETRY_DELAY); + else + min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + -PGSTAT_STAT_INTERVAL); + + /* + * Loop until fresh enough stats file is available or we ran out of time. + * The stats inquiry message is sent repeatedly in case collector drops + * it. + */ + for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) + { + TimestampTz file_ts = 0; + + CHECK_FOR_INTERRUPTS(); + + if (pgstat_read_statsfile_timestamp(false, &file_ts) && + file_ts >= min_ts) + break; + + /* Not there or too old, so kick the collector and wait a bit */ + pgstat_send_inquiry(min_ts); + pg_usleep(PGSTAT_RETRY_DELAY * 1000L); + } + + if (count >= PGSTAT_POLL_LOOP_COUNT) + elog(WARNING, "pgstat wait timeout"); + /* Autovacuum launcher wants stats about all databases */ if (IsAutoVacuumLauncherProcess()) - pgStatDBHash = pgstat_read_statsfile(InvalidOid); + pgStatDBHash = pgstat_read_statsfile(InvalidOid, false); else - pgStatDBHash = pgstat_read_statsfile(MyDatabaseId); + pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false); } @@ -2813,7 +3835,7 @@ pgstat_setup_memcxt(void) /* ---------- * pgstat_clear_snapshot() - * - * Discard any data collected in the current transaction. Any subsequent + * Discard any data collected in the current transaction. Any subsequent * request will cause new snapshots to be read. * * This is also invoked during transaction commit or abort to discard @@ -2835,6 +3857,20 @@ pgstat_clear_snapshot(void) } +/* ---------- + * pgstat_recv_inquiry() - + * + * Process stat inquiry requests. + * ---------- + */ +static void +pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) +{ + if (msg->inquiry_time > last_statrequest) + last_statrequest = msg->inquiry_time; +} + + /* ---------- * pgstat_recv_tabstat() - * @@ -2844,7 +3880,6 @@ pgstat_clear_snapshot(void) static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) { - PgStat_TableEntry *tabmsg = &(msg->m_entry[0]); PgStat_StatDBEntry *dbentry; PgStat_StatTabEntry *tabentry; int i; @@ -2863,8 +3898,10 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) */ for (i = 0; i < msg->m_nentries; i++) { + PgStat_TableEntry *tabmsg = &(msg->m_entry[i]); + tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &(tabmsg[i].t_id), + (void *) &(tabmsg->t_id), HASH_ENTER, &found); if (!found) @@ -2873,50 +3910,62 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) * If it's a new table entry, initialize counters to the values we * just got. */ - tabentry->numscans = tabmsg[i].t_counts.t_numscans; - tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned; - tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched; - tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted; - tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated; - tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted; - tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples; - tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples; - tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched; - tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit; - - tabentry->last_anl_tuples = 0; + tabentry->numscans = tabmsg->t_counts.t_numscans; + tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned; + tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched; + tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted; + tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; + tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; + tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; + tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched; + tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit; + tabentry->vacuum_timestamp = 0; + tabentry->vacuum_count = 0; tabentry->autovac_vacuum_timestamp = 0; + tabentry->autovac_vacuum_count = 0; tabentry->analyze_timestamp = 0; + tabentry->analyze_count = 0; tabentry->autovac_analyze_timestamp = 0; + tabentry->autovac_analyze_count = 0; } else { /* * Otherwise add the values to the existing entry. */ - tabentry->numscans += tabmsg[i].t_counts.t_numscans; - tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned; - tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched; - tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted; - tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated; - tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted; - tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples; - tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples; - tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched; - tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit; + tabentry->numscans += tabmsg->t_counts.t_numscans; + tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned; + tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted; + tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; + tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; + tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples; + tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched; + tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit; } + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + /* * Add per-table stats to the per-database entry, too. */ - dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned; - dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched; - dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted; - dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated; - dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted; - dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched; - dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit; + dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit; } } @@ -2977,6 +4026,8 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) { if (dbentry->tables != NULL) hash_destroy(dbentry->tables); + if (dbentry->functions != NULL) + hash_destroy(dbentry->functions); if (hash_search(pgStatDBHash, (void *) &(dbentry->databaseid), @@ -3014,12 +4065,28 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) */ if (dbentry->tables != NULL) hash_destroy(dbentry->tables); + if (dbentry->functions != NULL) + hash_destroy(dbentry->functions); dbentry->tables = NULL; + dbentry->functions = NULL; + + /* + * Reset database-level stats too. This should match the initialization + * code in pgstat_get_db_entry(). + */ dbentry->n_xact_commit = 0; dbentry->n_xact_rollback = 0; dbentry->n_blocks_fetched = 0; dbentry->n_blocks_hit = 0; + dbentry->n_tuples_returned = 0; + dbentry->n_tuples_fetched = 0; + dbentry->n_tuples_inserted = 0; + dbentry->n_tuples_updated = 0; + dbentry->n_tuples_deleted = 0; + dbentry->last_autovac_time = 0; + + dbentry->stat_reset_timestamp = GetCurrentTimestamp(); memset(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(Oid); @@ -3029,33 +4096,82 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) PGSTAT_TAB_HASH_SIZE, &hash_ctl, HASH_ELEM | HASH_FUNCTION); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); + hash_ctl.hash = oid_hash; + dbentry->functions = hash_create("Per-database function", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION); } /* ---------- - * pgstat_recv_autovac() - + * pgstat_recv_resetshared() - * - * Process an autovacuum signalling message. + * Reset some shared statistics of the cluster. * ---------- */ static void -pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) +pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len) { - PgStat_StatDBEntry *dbentry; + if (msg->m_resettarget == RESET_BGWRITER) + { + /* Reset the global background writer statistics for the cluster. */ + memset(&globalStats, 0, sizeof(globalStats)); + globalStats.stat_reset_timestamp = GetCurrentTimestamp(); + } /* - * Lookup the database in the hashtable. Don't create the entry if it - * doesn't exist, because autovacuum may be processing a template - * database. If this isn't the case, the database is most likely to have - * an entry already. (If it doesn't, not much harm is done anyway -- - * it'll get created as soon as somebody actually uses the database.) + * Presumably the sender of this message validated the target, don't + * complain here if it's not valid */ +} + +/* ---------- + * pgstat_recv_resetsinglecounter() - + * + * Reset a statistics for a single object + * ---------- + */ +static void +pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - if (dbentry == NULL) + + if (!dbentry) return; + /* Set the reset timestamp for the whole database */ + dbentry->stat_reset_timestamp = GetCurrentTimestamp(); + + /* Remove object if it exists, ignore it if not */ + if (msg->m_resettype == RESET_TABLE) + (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid), + HASH_REMOVE, NULL); + else if (msg->m_resettype == RESET_FUNCTION) + (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid), + HASH_REMOVE, NULL); +} + +/* ---------- + * pgstat_recv_autovac() - + * + * Process an autovacuum signalling message. + * ---------- + */ +static void +pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + /* - * Store the last autovacuum time in the database entry. + * Store the last autovacuum time in the database's hashtable entry. */ + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + dbentry->last_autovac_time = msg->m_start_time; } @@ -3072,38 +4188,25 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) PgStat_StatTabEntry *tabentry; /* - * Don't create either the database or table entry if it doesn't already - * exist. This avoids bloating the stats with entries for stuff that is - * only touched by vacuum and not by live operations. + * Store the data in the table's hashtable entry. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - if (dbentry == NULL) - return; + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - tabentry = hash_search(dbentry->tables, &(msg->m_tableoid), - HASH_FIND, NULL); - if (tabentry == NULL) - return; + tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - if (msg->m_autovacuum) - tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; - else - tabentry->vacuum_timestamp = msg->m_vacuumtime; tabentry->n_live_tuples = msg->m_tuples; + /* Resetting dead_tuples to 0 is an approximation ... */ tabentry->n_dead_tuples = 0; - if (msg->m_analyze) + + if (msg->m_autovacuum) { - tabentry->last_anl_tuples = msg->m_tuples; - if (msg->m_autovacuum) - tabentry->autovac_analyze_timestamp = msg->m_vacuumtime; - else - tabentry->analyze_timestamp = msg->m_vacuumtime; + tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; + tabentry->autovac_vacuum_count++; } else { - /* last_anl_tuples must never exceed n_live_tuples */ - tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples, - msg->m_tuples); + tabentry->vacuum_timestamp = msg->m_vacuumtime; + tabentry->vacuum_count++; } } @@ -3120,26 +4223,31 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) PgStat_StatTabEntry *tabentry; /* - * Don't create either the database or table entry if it doesn't already - * exist. This avoids bloating the stats with entries for stuff that is - * only touched by analyze and not by live operations. + * Store the data in the table's hashtable entry. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - if (dbentry == NULL) - return; + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - tabentry = hash_search(dbentry->tables, &(msg->m_tableoid), - HASH_FIND, NULL); - if (tabentry == NULL) - return; + tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + + tabentry->n_live_tuples = msg->m_live_tuples; + tabentry->n_dead_tuples = msg->m_dead_tuples; + + /* + * We reset changes_since_analyze to zero, forgetting any changes that + * occurred while the ANALYZE was in progress. + */ + tabentry->changes_since_analyze = 0; if (msg->m_autovacuum) + { tabentry->autovac_analyze_timestamp = msg->m_analyzetime; + tabentry->autovac_analyze_count++; + } else + { tabentry->analyze_timestamp = msg->m_analyzetime; - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; - tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples; + tabentry->analyze_count++; + } } @@ -3155,8 +4263,129 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len) globalStats.timed_checkpoints += msg->m_timed_checkpoints; globalStats.requested_checkpoints += msg->m_requested_checkpoints; globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints; - globalStats.buf_written_lru += msg->m_buf_written_lru; - globalStats.buf_written_all += msg->m_buf_written_all; - globalStats.maxwritten_lru += msg->m_maxwritten_lru; - globalStats.maxwritten_all += msg->m_maxwritten_all; + globalStats.buf_written_clean += msg->m_buf_written_clean; + globalStats.maxwritten_clean += msg->m_maxwritten_clean; + globalStats.buf_written_backend += msg->m_buf_written_backend; + globalStats.buf_fsync_backend += msg->m_buf_fsync_backend; + globalStats.buf_alloc += msg->m_buf_alloc; +} + +/* ---------- + * pgstat_recv_recoveryconflict() - + * + * Process as RECOVERYCONFLICT message. + * ---------- + */ +static void +pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + switch (msg->m_reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. + */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } +} + +/* ---------- + * pgstat_recv_funcstat() - + * + * Count what the backend has done. + * ---------- + */ +static void +pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) +{ + PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]); + PgStat_StatDBEntry *dbentry; + PgStat_StatFuncEntry *funcentry; + int i; + bool found; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + /* + * Process all function entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++, funcmsg++) + { + funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, + (void *) &(funcmsg->f_id), + HASH_ENTER, &found); + + if (!found) + { + /* + * If it's a new function entry, initialize counters to the values + * we just got. + */ + funcentry->f_numcalls = funcmsg->f_numcalls; + funcentry->f_time = funcmsg->f_time; + funcentry->f_time_self = funcmsg->f_time_self; + } + else + { + /* + * Otherwise add the values to the existing entry. + */ + funcentry->f_numcalls += funcmsg->f_numcalls; + funcentry->f_time += funcmsg->f_time; + funcentry->f_time_self += funcmsg->f_time_self; + } + } +} + +/* ---------- + * pgstat_recv_funcpurge() - + * + * Arrange for dead function removal. + * ---------- + */ +static void +pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + int i; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); + + /* + * No need to purge if we don't even know the database. + */ + if (!dbentry || !dbentry->functions) + return; + + /* + * Process all function entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++) + { + /* Remove from hashtable if present; we don't care if it's not. */ + (void) hash_search(dbentry->functions, + (void *) &(msg->m_functionid[i]), + HASH_REMOVE, NULL); + } }