--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * pg_stat_statements.c
+ * Track statement execution times across a whole database cluster.
+ *
+ * Note about locking issues: to create or delete an entry in the shared
+ * hashtable, one must hold pgss->lock exclusively. Modifying any field
+ * in an entry except the counters requires the same. To look up an entry,
+ * one must hold the lock shared. To read or update the counters within
+ * an entry, one must hold the lock shared or exclusive (so the entry doesn't
+ * disappear!) and also take the entry's mutex spinlock.
+ *
+ *
+ * Copyright (c) 2008-2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/contrib/pg_stat_statements/pg_stat_statements.c,v 1.1 2009/01/04 22:19:59 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/hash.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "executor/instrument.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/hsearch.h"
+#include "utils/guc.h"
+
+
+PG_MODULE_MAGIC;
+
+/* Location of stats file */
+#define PGSS_DUMP_FILE "global/pg_stat_statements.stat"
+
+/* This constant defines the magic number in the stats file header */
+static const uint32 PGSS_FILE_HEADER = 0x20081202;
+
+/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
+#define USAGE_EXEC(duration) (1.0)
+#define USAGE_INIT (1.0) /* including initial planning */
+#define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
+#define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
+
+/*
+ * Hashtable key that defines the identity of a hashtable entry. The
+ * hash comparators do not assume that the query string is null-terminated;
+ * this lets us search for an mbcliplen'd string without copying it first.
+ *
+ * Presently, the query encoding is fully determined by the source database
+ * and so we don't really need it to be in the key. But that might not always
+ * be true. Anyway it's notationally convenient to pass it as part of the key.
+ */
+typedef struct pgssHashKey
+{
+ Oid userid; /* user OID */
+ Oid dbid; /* database OID */
+ int encoding; /* query encoding */
+ int query_len; /* # of valid bytes in query string */
+ const char *query_ptr; /* query string proper */
+} pgssHashKey;
+
+/*
+ * The actual stats counters kept within pgssEntry.
+ */
+typedef struct Counters
+{
+ int64 calls; /* # of times executed */
+ double total_time; /* total execution time in seconds */
+ int64 rows; /* total # of retrieved or affected rows */
+ double usage; /* usage factor */
+} Counters;
+
+/*
+ * Statistics per statement
+ *
+ * NB: see the file read/write code before changing field order here.
+ */
+typedef struct pgssEntry
+{
+ pgssHashKey key; /* hash key of entry - MUST BE FIRST */
+ Counters counters; /* the statistics for this query */
+ slock_t mutex; /* protects the counters only */
+ char query[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */
+ /* Note: the allocated length of query[] is actually pgss->query_size */
+} pgssEntry;
+
+/*
+ * Global shared state
+ */
+typedef struct pgssSharedState
+{
+ LWLockId lock; /* protects hashtable search/modification */
+ int query_size; /* max query length in bytes */
+} pgssSharedState;
+
+/*---- Local variables ----*/
+
+/* Current nesting depth of ExecutorRun calls */
+static int nested_level = 0;
+/* Saved hook values in case of unload */
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+/* Links to shared memory state */
+static pgssSharedState *pgss = NULL;
+static HTAB *pgss_hash = NULL;
+
+/*---- GUC variables ----*/
+
+typedef enum
+{
+ PGSS_TRACK_NONE, /* track no statements */
+ PGSS_TRACK_TOP, /* only top level statements */
+ PGSS_TRACK_ALL, /* all statements, including nested ones */
+} PGSSTrackLevel;
+
+static const struct config_enum_entry track_options[] = {
+ {"none", PGSS_TRACK_NONE, false},
+ {"top", PGSS_TRACK_TOP, false},
+ {"all", PGSS_TRACK_ALL, false},
+ {NULL, 0, false}
+};
+
+static int pgss_max; /* max # statements to track */
+static int pgss_track; /* tracking level */
+static bool pgss_save; /* whether to save stats across shutdown */
+
+
+#define pgss_enabled() \
+ (pgss_track == PGSS_TRACK_ALL || \
+ (pgss_track == PGSS_TRACK_TOP && nested_level == 0))
+
+/*---- Function declarations ----*/
+
+void _PG_init(void);
+void _PG_fini(void);
+
+Datum pg_stat_statements_reset(PG_FUNCTION_ARGS);
+Datum pg_stat_statements(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
+PG_FUNCTION_INFO_V1(pg_stat_statements);
+
+static void pgss_shmem_startup(void);
+static void pgss_shmem_shutdown(int code, Datum arg);
+static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void pgss_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction,
+ long count);
+static void pgss_ExecutorEnd(QueryDesc *queryDesc);
+static uint32 pgss_hash_fn(const void *key, Size keysize);
+static int pgss_match_fn(const void *key1, const void *key2, Size keysize);
+static void pgss_store(const char *query,
+ const Instrumentation *instr, uint32 rows);
+static Size pgss_memsize(void);
+static pgssEntry *entry_alloc(pgssHashKey *key);
+static void entry_dealloc(void);
+static void entry_reset(void);
+
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+ /*
+ * In order to create our shared memory area, we have to be loaded via
+ * shared_preload_libraries. If not, fall out without hooking into
+ * any of the main system. (We don't throw error here because it seems
+ * useful to allow the pg_stat_statements functions to be created even
+ * when the module isn't active. The functions must protect themselves
+ * against being called then, however.)
+ */
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+
+ /*
+ * Define (or redefine) custom GUC variables.
+ */
+ DefineCustomIntVariable("pg_stat_statements.max",
+ "Sets the maximum number of statements tracked by pg_stat_statements.",
+ NULL,
+ &pgss_max,
+ 1000,
+ 100,
+ INT_MAX,
+ PGC_POSTMASTER,
+ 0,
+ NULL,
+ NULL);
+
+ DefineCustomEnumVariable("pg_stat_statements.track",
+ "Selects which statements are tracked by pg_stat_statements.",
+ NULL,
+ &pgss_track,
+ PGSS_TRACK_TOP,
+ track_options,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_stat_statements.save",
+ "Save pg_stat_statements statistics across server shutdowns.",
+ NULL,
+ &pgss_save,
+ true,
+ PGC_SIGHUP,
+ 0,
+ NULL,
+ NULL);
+
+ /*
+ * Request additional shared resources. (These are no-ops if we're not in
+ * the postmaster process.) We'll allocate or attach to the shared
+ * resources in pgss_shmem_startup().
+ */
+ RequestAddinShmemSpace(pgss_memsize());
+ RequestAddinLWLocks(1);
+
+ /*
+ * Install hooks.
+ */
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = pgss_shmem_startup;
+ prev_ExecutorStart = ExecutorStart_hook;
+ ExecutorStart_hook = pgss_ExecutorStart;
+ prev_ExecutorRun = ExecutorRun_hook;
+ ExecutorRun_hook = pgss_ExecutorRun;
+ prev_ExecutorEnd = ExecutorEnd_hook;
+ ExecutorEnd_hook = pgss_ExecutorEnd;
+}
+
+/*
+ * Module unload callback
+ */
+void
+_PG_fini(void)
+{
+ /* Uninstall hooks. */
+ ExecutorStart_hook = prev_ExecutorStart;
+ ExecutorRun_hook = prev_ExecutorRun;
+ ExecutorEnd_hook = prev_ExecutorEnd;
+ shmem_startup_hook = prev_shmem_startup_hook;
+}
+
+/*
+ * shmem_startup hook: allocate or attach to shared memory,
+ * then load any pre-existing statistics from file.
+ */
+static void
+pgss_shmem_startup(void)
+{
+ bool found;
+ HASHCTL info;
+ FILE *file;
+ uint32 header;
+ int32 num;
+ int32 i;
+ int query_size;
+ int buffer_size;
+ char *buffer = NULL;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* reset in case this is a restart within the postmaster */
+ pgss = NULL;
+ pgss_hash = NULL;
+
+ /*
+ * Create or attach to the shared memory state, including hash table
+ */
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+ pgss = ShmemInitStruct("pg_stat_statements",
+ sizeof(pgssSharedState),
+ &found);
+ if (!pgss)
+ elog(ERROR, "out of shared memory");
+
+ if (!found)
+ {
+ /* First time through ... */
+ pgss->lock = LWLockAssign();
+ pgss->query_size = pgstat_track_activity_query_size;
+ }
+
+ /* Be sure everyone agrees on the hash table entry size */
+ query_size = pgss->query_size;
+
+ memset(&info, 0, sizeof(info));
+ info.keysize = sizeof(pgssHashKey);
+ info.entrysize = offsetof(pgssEntry, query) + query_size;
+ info.hash = pgss_hash_fn;
+ info.match = pgss_match_fn;
+ pgss_hash = ShmemInitHash("pg_stat_statements hash",
+ pgss_max, pgss_max,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
+ if (!pgss_hash)
+ elog(ERROR, "out of shared memory");
+
+ LWLockRelease(AddinShmemInitLock);
+
+ /*
+ * If we're in the postmaster (or a standalone backend...), set up a
+ * shmem exit hook to dump the statistics to disk.
+ */
+ if (!IsUnderPostmaster)
+ on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
+
+ /*
+ * Attempt to load old statistics from the dump file.
+ *
+ * Note: we don't bother with locks here, because there should be no
+ * other processes running when this is called.
+ */
+ if (!pgss_save)
+ return;
+
+ file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
+ if (file == NULL)
+ {
+ if (errno == ENOENT)
+ return; /* ignore not-found error */
+ goto error;
+ }
+
+ buffer_size = query_size;
+ buffer = (char *) palloc(buffer_size);
+
+ if (fread(&header, sizeof(uint32), 1, file) != 1 ||
+ header != PGSS_FILE_HEADER ||
+ fread(&num, sizeof(int32), 1, file) != 1)
+ goto error;
+
+ for (i = 0; i < num; i++)
+ {
+ pgssEntry temp;
+ pgssEntry *entry;
+
+ if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1)
+ goto error;
+
+ /* Encoding is the only field we can easily sanity-check */
+ if (!PG_VALID_BE_ENCODING(temp.key.encoding))
+ goto error;
+
+ /* Previous incarnation might have had a larger query_size */
+ if (temp.key.query_len >= buffer_size)
+ {
+ buffer = (char *) repalloc(buffer, temp.key.query_len + 1);
+ buffer_size = temp.key.query_len + 1;
+ }
+
+ if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len)
+ goto error;
+ buffer[temp.key.query_len] = '\0';
+
+ /* Clip to available length if needed */
+ if (temp.key.query_len >= query_size)
+ temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding,
+ buffer,
+ temp.key.query_len,
+ query_size - 1);
+ temp.key.query_ptr = buffer;
+
+ /* make the hashtable entry (discards old entries if too many) */
+ entry = entry_alloc(&temp.key);
+
+ /* copy in the actual stats */
+ entry->counters = temp.counters;
+ }
+
+ pfree(buffer);
+ FreeFile(file);
+ return;
+
+error:
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not read pg_stat_statement file \"%s\": %m",
+ PGSS_DUMP_FILE)));
+ if (buffer)
+ pfree(buffer);
+ if (file)
+ FreeFile(file);
+ /* If possible, throw away the bogus file; ignore any error */
+ unlink(PGSS_DUMP_FILE);
+}
+
+/*
+ * shmem_shutdown hook: Dump statistics into file.
+ *
+ * Note: we don't bother with acquiring lock, because there should be no
+ * other processes running when this is called.
+ */
+static void
+pgss_shmem_shutdown(int code, Datum arg)
+{
+ FILE *file;
+ HASH_SEQ_STATUS hash_seq;
+ int32 num_entries;
+ pgssEntry *entry;
+
+ /* Don't try to dump during a crash. */
+ if (code)
+ return;
+
+ /* Safety check ... shouldn't get here unless shmem is set up. */
+ if (!pgss || !pgss_hash)
+ return;
+
+ /* Don't dump if told not to. */
+ if (!pgss_save)
+ return;
+
+ file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_W);
+ if (file == NULL)
+ goto error;
+
+ if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
+ goto error;
+ num_entries = hash_get_num_entries(pgss_hash);
+ if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
+ goto error;
+
+ hash_seq_init(&hash_seq, pgss_hash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ int len = entry->key.query_len;
+
+ if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 ||
+ fwrite(entry->query, 1, len, file) != len)
+ goto error;
+ }
+
+ if (FreeFile(file))
+ {
+ file = NULL;
+ goto error;
+ }
+
+ return;
+
+error:
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write pg_stat_statement file \"%s\": %m",
+ PGSS_DUMP_FILE)));
+ if (file)
+ FreeFile(file);
+ unlink(PGSS_DUMP_FILE);
+}
+
+/*
+ * ExecutorStart hook: start up tracking if needed
+ */
+static void
+pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ if (prev_ExecutorStart)
+ prev_ExecutorStart(queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+
+ if (pgss_enabled())
+ {
+ /*
+ * Set up to track total elapsed time in ExecutorRun. Make sure
+ * the space is allocated in the per-query context so it will go
+ * away at ExecutorEnd.
+ */
+ if (queryDesc->totaltime == NULL)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+ queryDesc->totaltime = InstrAlloc(1);
+ MemoryContextSwitchTo(oldcxt);
+ }
+ }
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
+{
+ nested_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorRun)
+ prev_ExecutorRun(queryDesc, direction, count);
+ else
+ standard_ExecutorRun(queryDesc, direction, count);
+ nested_level--;
+ }
+ PG_CATCH();
+ {
+ nested_level--;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: store results if needed
+ */
+static void
+pgss_ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (queryDesc->totaltime && pgss_enabled())
+ {
+ /*
+ * Make sure stats accumulation is done. (Note: it's okay if
+ * several levels of hook all do this.)
+ */
+ InstrEndLoop(queryDesc->totaltime);
+
+ pgss_store(queryDesc->sourceText,
+ queryDesc->totaltime,
+ queryDesc->estate->es_processed);
+ }
+
+ if (prev_ExecutorEnd)
+ prev_ExecutorEnd(queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+/*
+ * Calculate hash value for a key
+ */
+static uint32
+pgss_hash_fn(const void *key, Size keysize)
+{
+ const pgssHashKey *k = (const pgssHashKey *) key;
+
+ /* we don't bother to include encoding in the hash */
+ return hash_uint32((uint32) k->userid) ^
+ hash_uint32((uint32) k->dbid) ^
+ DatumGetUInt32(hash_any((const unsigned char *) k->query_ptr,
+ k->query_len));
+}
+
+/*
+ * Compare two keys - zero means match
+ */
+static int
+pgss_match_fn(const void *key1, const void *key2, Size keysize)
+{
+ const pgssHashKey *k1 = (const pgssHashKey *) key1;
+ const pgssHashKey *k2 = (const pgssHashKey *) key2;
+
+ if (k1->userid == k2->userid &&
+ k1->dbid == k2->dbid &&
+ k1->encoding == k2->encoding &&
+ k1->query_len == k2->query_len &&
+ memcmp(k1->query_ptr, k2->query_ptr, k1->query_len) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+/*
+ * Store some statistics for a statement.
+ */
+static void
+pgss_store(const char *query, const Instrumentation *instr, uint32 rows)
+{
+ pgssHashKey key;
+ double usage;
+ pgssEntry *entry;
+
+ Assert(query != NULL);
+
+ /* Safety check... */
+ if (!pgss || !pgss_hash)
+ return;
+
+ /* Set up key for hashtable search */
+ key.userid = GetUserId();
+ key.dbid = MyDatabaseId;
+ key.encoding = GetDatabaseEncoding();
+ key.query_len = strlen(query);
+ if (key.query_len >= pgss->query_size)
+ key.query_len = pg_encoding_mbcliplen(key.encoding,
+ query,
+ key.query_len,
+ pgss->query_size - 1);
+ key.query_ptr = query;
+
+ usage = USAGE_EXEC(duration);
+
+ /* Lookup the hash table entry with shared lock. */
+ LWLockAcquire(pgss->lock, LW_SHARED);
+
+ entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
+ if (!entry)
+ {
+ /* Must acquire exclusive lock to add a new entry. */
+ LWLockRelease(pgss->lock);
+ LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
+ entry = entry_alloc(&key);
+ }
+
+ /* Grab the spinlock while updating the counters. */
+ {
+ volatile pgssEntry *e = (volatile pgssEntry *) entry;
+
+ SpinLockAcquire(&e->mutex);
+ e->counters.calls += 1;
+ e->counters.total_time += instr->total;
+ e->counters.rows += rows;
+ e->counters.usage += usage;
+ SpinLockRelease(&e->mutex);
+ }
+
+ LWLockRelease(pgss->lock);
+}
+
+/*
+ * Reset all statement statistics.
+ */
+Datum
+pg_stat_statements_reset(PG_FUNCTION_ARGS)
+{
+ if (!pgss || !pgss_hash)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
+ entry_reset();
+ PG_RETURN_VOID();
+}
+
+#define PG_STAT_STATEMENTS_COLS 6
+
+/*
+ * Retrieve statement statistics.
+ */
+Datum
+pg_stat_statements(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ Oid userid = GetUserId();
+ bool is_superuser = superuser();
+ HASH_SEQ_STATUS hash_seq;
+ pgssEntry *entry;
+
+ if (!pgss || !pgss_hash)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not " \
+ "allowed in this context")));
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(PG_STAT_STATEMENTS_COLS, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "userid",
+ OIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dbid",
+ OIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "query",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 4, "calls",
+ INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 5, "total_time",
+ FLOAT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 6, "rows",
+ INT8OID, -1, 0);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ LWLockAcquire(pgss->lock, LW_SHARED);
+
+ hash_seq_init(&hash_seq, pgss_hash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ Datum values[PG_STAT_STATEMENTS_COLS];
+ bool nulls[PG_STAT_STATEMENTS_COLS];
+ int i = 0;
+ Counters tmp;
+
+ /* generate junk in short-term context */
+ MemoryContextSwitchTo(oldcontext);
+
+ memset(values, 0, sizeof(values));
+ memset(nulls, 0, sizeof(nulls));
+
+ values[i++] = ObjectIdGetDatum(entry->key.userid);
+ values[i++] = ObjectIdGetDatum(entry->key.dbid);
+
+ if (is_superuser || entry->key.userid == userid)
+ {
+ char *qstr;
+
+ qstr = (char *)
+ pg_do_encoding_conversion((unsigned char *) entry->query,
+ entry->key.query_len,
+ entry->key.encoding,
+ GetDatabaseEncoding());
+ values[i++] = CStringGetTextDatum(qstr);
+ if (qstr != entry->query)
+ pfree(qstr);
+ }
+ else
+ values[i++] = CStringGetTextDatum("<insufficient privilege>");
+
+ /* copy counters to a local variable to keep locking time short */
+ {
+ volatile pgssEntry *e = (volatile pgssEntry *) entry;
+
+ SpinLockAcquire(&e->mutex);
+ tmp = e->counters;
+ SpinLockRelease(&e->mutex);
+ }
+
+ values[i++] = Int64GetDatumFast(tmp.calls);
+ values[i++] = Float8GetDatumFast(tmp.total_time);
+ values[i++] = Int64GetDatumFast(tmp.rows);
+
+ Assert(i == PG_STAT_STATEMENTS_COLS);
+
+ /* switch to appropriate context while storing the tuple */
+ MemoryContextSwitchTo(per_query_ctx);
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ LWLockRelease(pgss->lock);
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return (Datum) 0;
+}
+
+/*
+ * Estimate shared memory space needed.
+ */
+static Size
+pgss_memsize(void)
+{
+ Size size;
+ Size entrysize;
+
+ size = MAXALIGN(sizeof(pgssSharedState));
+ entrysize = offsetof(pgssEntry, query) + pgstat_track_activity_query_size;
+ size = add_size(size, hash_estimate_size(pgss_max, entrysize));
+
+ return size;
+}
+
+/*
+ * Allocate a new hashtable entry.
+ * caller must hold an exclusive lock on pgss->lock
+ *
+ * Note: despite needing exclusive lock, it's not an error for the target
+ * entry to already exist. This is because pgss_store releases and
+ * reacquires lock after failing to find a match; so someone else could
+ * have made the entry while we waited to get exclusive lock.
+ */
+static pgssEntry *
+entry_alloc(pgssHashKey *key)
+{
+ pgssEntry *entry;
+ bool found;
+
+ /* Caller must have clipped query properly */
+ Assert(key->query_len < pgss->query_size);
+
+ /* Make space if needed */
+ while (hash_get_num_entries(pgss_hash) >= pgss_max)
+ entry_dealloc();
+
+ /* Find or create an entry with desired hash code */
+ entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
+
+ if (!found)
+ {
+ /* New entry, initialize it */
+
+ /* dynahash tried to copy the key for us, but must fix query_ptr */
+ entry->key.query_ptr = entry->query;
+ /* reset the statistics */
+ memset(&entry->counters, 0, sizeof(Counters));
+ entry->counters.usage = USAGE_INIT;
+ /* re-initialize the mutex each time ... we assume no one using it */
+ SpinLockInit(&entry->mutex);
+ /* ... and don't forget the query text */
+ memcpy(entry->query, key->query_ptr, key->query_len);
+ entry->query[key->query_len] = '\0';
+ }
+
+ return entry;
+}
+
+/*
+ * qsort comparator for sorting into increasing usage order
+ */
+static int
+entry_cmp(const void *lhs, const void *rhs)
+{
+ double l_usage = (*(const pgssEntry **)lhs)->counters.usage;
+ double r_usage = (*(const pgssEntry **)rhs)->counters.usage;
+
+ if (l_usage < r_usage)
+ return -1;
+ else if (l_usage > r_usage)
+ return +1;
+ else
+ return 0;
+}
+
+/*
+ * Deallocate least used entries.
+ * Caller must hold an exclusive lock on pgss->lock.
+ */
+static void
+entry_dealloc(void)
+{
+ HASH_SEQ_STATUS hash_seq;
+ pgssEntry **entries;
+ pgssEntry *entry;
+ int nvictims;
+ int i;
+
+ /* Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. */
+
+ entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
+
+ i = 0;
+ hash_seq_init(&hash_seq, pgss_hash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ entries[i++] = entry;
+ entry->counters.usage *= USAGE_DECREASE_FACTOR;
+ }
+
+ qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
+ nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
+ nvictims = Min(nvictims, i);
+
+ for (i = 0; i < nvictims; i++)
+ {
+ hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
+ }
+
+ pfree(entries);
+}
+
+/*
+ * Release all entries.
+ */
+static void
+entry_reset(void)
+{
+ HASH_SEQ_STATUS hash_seq;
+ pgssEntry *entry;
+
+ LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
+
+ hash_seq_init(&hash_seq, pgss_hash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
+ }
+
+ LWLockRelease(pgss->lock);
+}
--- /dev/null
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstatstatements.sgml,v 1.1 2009/01/04 22:19:59 tgl Exp $ -->
+
+<sect1 id="pgstatstatements">
+ <title>pg_stat_statements</title>
+
+ <indexterm zone="pgstatstatements">
+ <primary>pg_stat_statements</primary>
+ </indexterm>
+
+ <para>
+ The <filename>pg_stat_statements</filename> module provides a means for
+ tracking execution statistics of all SQL statements executed by a server.
+ </para>
+
+ <para>
+ The module must be loaded by adding <literal>pg_stat_statements</> to
+ <xref linkend="guc-shared-preload-libraries"> in
+ <filename>postgresql.conf</>, because it requires additional shared memory.
+ This means that a server restart is needed to add or remove the module.
+ </para>
+
+ <sect2>
+ <title>The <structname>pg_stat_statements</structname> view</title>
+
+ <para>
+ The statistics gathered by the module are made available via a system view
+ named <structname>pg_stat_statements</>. This view contains one row for
+ each distinct query text, database ID, and user ID (up to the maximum
+ number of distinct statements that the module can track). The columns
+ of the view are:
+ </para>
+
+ <table>
+ <title><structname>pg_stat_statements</> columns</title>
+
+ <tgroup cols="4">
+ <thead>
+ <row>
+ <entry>Name</entry>
+ <entry>Type</entry>
+ <entry>References</entry>
+ <entry>Description</entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry><structfield>userid</structfield></entry>
+ <entry><type>oid</type></entry>
+ <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
+ <entry>OID of user who executed the statement</entry>
+ </row>
+
+ <row>
+ <entry><structfield>dbid</structfield></entry>
+ <entry><type>oid</type></entry>
+ <entry><literal><link linkend="catalog-pg-database"><structname>pg_database</structname></link>.oid</literal></entry>
+ <entry>OID of database in which the statement was executed</entry>
+ </row>
+
+ <row>
+ <entry><structfield>query</structfield></entry>
+ <entry><type>text</type></entry>
+ <entry></entry>
+ <entry>Text of the statement (up to <xref linkend="guc-track-activity-query-size"> bytes)</entry>
+ </row>
+
+ <row>
+ <entry><structfield>calls</structfield></entry>
+ <entry><type>bigint</type></entry>
+ <entry></entry>
+ <entry>Number of times executed</entry>
+ </row>
+
+ <row>
+ <entry><structfield>total_time</structfield></entry>
+ <entry><type>double precision</type></entry>
+ <entry></entry>
+ <entry>Total time spent in the statement, in seconds</entry>
+ </row>
+
+ <row>
+ <entry><structfield>rows</structfield></entry>
+ <entry><type>bigint</type></entry>
+ <entry></entry>
+ <entry>Total number of rows retrieved or affected by the statement</entry>
+ </row>
+
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ This view, and the function <function>pg_stat_statements_reset</>,
+ are available only in databases they have been specifically installed into
+ by running the <filename>pg_stat_statements.sql</> install script.
+ However, statistics are tracked across all databases of the server
+ whenever the <filename>pg_stat_statements</filename> module is loaded
+ into the server, regardless of presence of the view.
+ </para>
+
+ <para>
+ For security reasons, non-superusers are not allowed to see the text of
+ queries executed by other users. They can see the statistics, however,
+ if the view has been installed in their database.
+ </para>
+
+ <para>
+ Note that statements are considered the same if they have the same text,
+ regardless of the values of any out-of-line parameters used in the
+ statement. Using out-of-line parameters will help to group statements
+ together and may make the statistics more useful.
+ </para>
+ </sect2>
+
+ <sect2>
+ <title>Functions</title>
+
+ <variablelist>
+ <varlistentry>
+ <term>
+ <function>pg_stat_statements_reset() returns void</function>
+ </term>
+
+ <listitem>
+ <para>
+ <function>pg_stat_statements_reset</function> discards all statistics
+ gathered so far by <filename>pg_stat_statements</>.
+ By default, this function can only be executed by superusers.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+ </sect2>
+
+ <sect2>
+ <title>Configuration parameters</title>
+
+ <variablelist>
+ <varlistentry>
+ <term>
+ <varname>pg_stat_statements.max</varname> (<type>integer</type>)
+ </term>
+
+ <listitem>
+ <para>
+ <varname>pg_stat_statements.max</varname> is the maximum number of
+ statements tracked by the module (i.e., the maximum number of rows
+ in the <structname>pg_stat_statements</> view). If more distinct
+ statements than that are observed, information about the least-executed
+ statements is discarded.
+ The default value is 1000.
+ This parameter can only be set at server start.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <varname>pg_stat_statements.track</varname> (<type>enum</type>)
+ </term>
+
+ <listitem>
+ <para>
+ <varname>pg_stat_statements.track</varname> controls which statements
+ are counted by the module.
+ Specify <literal>top</> to track top-level statements (those issued
+ directly by clients), <literal>all</> to also track nested statements
+ (such as statements invoked within functions), or <literal>none</> to
+ disable.
+ The default value is <literal>top</>.
+ Only superusers can change this setting.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <varname>pg_stat_statements.save</varname> (<type>boolean</type>)
+ </term>
+
+ <listitem>
+ <para>
+ <varname>pg_stat_statements.save</varname> specifies whether to
+ save statement statistics across server shutdowns.
+ If it is <literal>off</> then statistics are not saved at
+ shutdown nor reloaded at server start.
+ The default value is <literal>on</>.
+ This parameter can only be set in the <filename>postgresql.conf</>
+ file or on the server command line.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+
+ <para>
+ The module requires additional shared memory amounting to about
+ <varname>pg_stat_statements.max</varname> <literal>*</>
+ <xref linkend="guc-track-activity-query-size"> bytes. Note that this
+ memory is consumed whenever the module is loaded, even if
+ <varname>pg_stat_statements.track</> is set to <literal>none</>.
+ </para>
+
+ <para>
+ In order to set any of these parameters in your
+ <filename>postgresql.conf</> file,
+ you will need to add <literal>pg_stat_statements</> to
+ <xref linkend="guc-custom-variable-classes">. Typical usage might be:
+ </para>
+
+ <programlisting>
+# postgresql.conf
+shared_preload_libraries = 'pg_stat_statements'
+
+custom_variable_classes = 'pg_stat_statements'
+pg_stat_statements.max = 10000
+pg_stat_statements.track = all
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>Sample output</title>
+
+ <programlisting>
+$ pgbench -i bench
+
+postgres=# SELECT pg_stat_statements_reset();
+
+$ pgbench -c10 -t300 -M prepared bench
+
+postgres=# \x
+postgres=# SELECT * FROM pg_stat_statements ORDER BY total_time DESC LIMIT 3;
+-[ RECORD 1 ]------------------------------------------------------------
+userid | 10
+dbid | 63781
+query | UPDATE branches SET bbalance = bbalance + $1 WHERE bid = $2;
+calls | 3000
+total_time | 20.716706
+rows | 3000
+-[ RECORD 2 ]------------------------------------------------------------
+userid | 10
+dbid | 63781
+query | UPDATE tellers SET tbalance = tbalance + $1 WHERE tid = $2;
+calls | 3000
+total_time | 17.1107649999999
+rows | 3000
+-[ RECORD 3 ]------------------------------------------------------------
+userid | 10
+dbid | 63781
+query | UPDATE accounts SET abalance = abalance + $1 WHERE aid = $2;
+calls | 3000
+total_time | 0.645601
+rows | 3000
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>Author</title>
+
+ <para>
+ Takahiro Itagaki <email>itagaki.takahiro@oss.ntt.co.jp</email>
+ </para>
+ </sect2>
+
+</sect1>