From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 4 Jan 2009 22:19:59 +0000 (+0000)
Subject: Add contrib/pg_stat_statements for server-wide tracking of statement execution
X-Git-Tag: REL8_4_BETA1~482
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7466eeac61e2ce2af25d67d25d3ec60f0f0764da;p=postgresql

Add contrib/pg_stat_statements for server-wide tracking of statement execution
statistics.

Takahiro Itagaki
---

diff --git a/contrib/Makefile b/contrib/Makefile
index bbd43e1f66..4b1d2ae95d 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.85 2008/11/19 02:59:28 tgl Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.86 2009/01/04 22:19:59 tgl Exp $
 
 subdir = contrib
 top_builddir = ..
@@ -27,6 +27,7 @@ WANTED_DIRS = \
 		pg_buffercache	\
 		pg_freespacemap \
 		pg_standby	\
+		pg_stat_statements \
 		pg_trgm		\
 		pgbench		\
 		pgcrypto	\
diff --git a/contrib/README b/contrib/README
index 060853fa21..7d258d4b2f 100644
--- a/contrib/README
+++ b/contrib/README
@@ -112,6 +112,10 @@ pg_standby -
 	Sample archive_command for warm standby operation
 	by Simon Riggs <simon@2ndquadrant.com>
 
+pg_stat_statements -
+	Track statement execution times across a whole database cluster
+	by Takahiro Itagaki <itagaki.takahiro@oss.ntt.co.jp>
+
 pg_trgm -
 	Functions for determining the similarity of text based on trigram
 	matching.
diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile
new file mode 100644
index 0000000000..ce335a656e
--- /dev/null
+++ b/contrib/pg_stat_statements/Makefile
@@ -0,0 +1,17 @@
+# $PostgreSQL: pgsql/contrib/pg_stat_statements/Makefile,v 1.1 2009/01/04 22:19:59 tgl Exp $
+
+MODULE_big = pg_stat_statements
+DATA_built = pg_stat_statements.sql
+DATA = uninstall_pg_stat_statements.sql
+OBJS = pg_stat_statements.o
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_statements
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
new file mode 100644
index 0000000000..df14d0559b
--- /dev/null
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -0,0 +1,904 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_stat_statements.c
+ *		Track statement execution times across a whole database cluster.
+ *
+ * Note about locking issues: to create or delete an entry in the shared
+ * hashtable, one must hold pgss->lock exclusively.  Modifying any field
+ * in an entry except the counters requires the same.  To look up an entry,
+ * one must hold the lock shared.  To read or update the counters within
+ * an entry, one must hold the lock shared or exclusive (so the entry doesn't
+ * disappear!) and also take the entry's mutex spinlock.
+ *
+ *
+ * Copyright (c) 2008-2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/pg_stat_statements/pg_stat_statements.c,v 1.1 2009/01/04 22:19:59 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/hash.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "executor/instrument.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/hsearch.h"
+#include "utils/guc.h"
+
+
+PG_MODULE_MAGIC;
+
+/* Location of stats file */
+#define PGSS_DUMP_FILE	"global/pg_stat_statements.stat"
+
+/* This constant defines the magic number in the stats file header */
+static const uint32 PGSS_FILE_HEADER = 0x20081202;
+
+/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
+#define USAGE_EXEC(duration)	(1.0)
+#define USAGE_INIT				(1.0)	/* including initial planning */
+#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
+#define USAGE_DEALLOC_PERCENT	5		/* free this % of entries at once */
+
+/*
+ * Hashtable key that defines the identity of a hashtable entry.  The
+ * hash comparators do not assume that the query string is null-terminated;
+ * this lets us search for an mbcliplen'd string without copying it first.
+ *
+ * Presently, the query encoding is fully determined by the source database
+ * and so we don't really need it to be in the key.  But that might not always
+ * be true. Anyway it's notationally convenient to pass it as part of the key.
+ */
+typedef struct pgssHashKey
+{
+	Oid			userid;			/* OID of the user who ran the statement */
+	Oid			dbid;			/* OID of the database it ran in */
+	int			encoding;		/* query encoding; compared but not hashed */
+	int			query_len;		/* # of valid bytes in query string */
+	const char *query_ptr;		/* query text; need not be null-terminated */
+} pgssHashKey;
+
+/*
+ * The actual stats counters kept within pgssEntry.
+ */
+typedef struct Counters
+{
+	int64		calls;			/* # of times executed */
+	double		total_time;		/* total execution time in seconds */
+	int64		rows;			/* total # of retrieved or affected rows */
+	double		usage;			/* usage factor; lowest values are evicted first */
+} Counters;
+
+/*
+ * Statistics per statement
+ *
+ * NB: see the file read/write code before changing field order here.
+ */
+typedef struct pgssEntry
+{
+	pgssHashKey	key;			/* hash key of entry - MUST BE FIRST */
+	Counters	counters;		/* the statistics; dumped to file along with key */
+	slock_t		mutex;			/* protects the counters only; not dumped to file */
+	char		query[1];		/* VARIABLE LENGTH ARRAY - MUST BE LAST */
+	/* Note: the allocated length of query[] is actually pgss->query_size */
+} pgssEntry;
+
+/*
+ * Global shared state
+ */
+typedef struct pgssSharedState
+{
+	LWLockId	lock;			/* protects hashtable search/modification */
+	int			query_size;		/* max query length in bytes, set at creation */
+} pgssSharedState;
+
+/*---- Local variables ----*/
+
+/* Current nesting depth of ExecutorRun calls */
+static int						nested_level = 0;
+/* Saved hook values in case of unload */
+static shmem_startup_hook_type	prev_shmem_startup_hook = NULL;
+static ExecutorStart_hook_type	prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type	prev_ExecutorRun = NULL;
+static ExecutorEnd_hook_type	prev_ExecutorEnd = NULL;
+/* Links to shared memory state */
+static pgssSharedState		   *pgss = NULL;
+static HTAB					   *pgss_hash = NULL;
+
+/*---- GUC variables ----*/
+
+typedef enum
+{
+	PGSS_TRACK_NONE,			/* track no statements */
+	PGSS_TRACK_TOP,				/* only top level statements */
+	PGSS_TRACK_ALL,				/* all statements, including nested ones */
+} PGSSTrackLevel;
+
+static const struct config_enum_entry track_options[] = {
+	{"none", PGSS_TRACK_NONE, false},
+	{"top", PGSS_TRACK_TOP, false},
+	{"all", PGSS_TRACK_ALL, false},
+	{NULL, 0, false}
+};
+
+static int	pgss_max;			/* max # statements to track */
+static int	pgss_track;			/* tracking level */
+static bool pgss_save;			/* whether to save stats across shutdown */
+
+
+#define pgss_enabled() \
+	(pgss_track == PGSS_TRACK_ALL || \
+	(pgss_track == PGSS_TRACK_TOP && nested_level == 0))
+
+/*---- Function declarations ----*/
+
+void	_PG_init(void);
+void	_PG_fini(void);
+
+Datum	pg_stat_statements_reset(PG_FUNCTION_ARGS);
+Datum	pg_stat_statements(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
+PG_FUNCTION_INFO_V1(pg_stat_statements);
+
+static void pgss_shmem_startup(void);
+static void pgss_shmem_shutdown(int code, Datum arg);
+static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void pgss_ExecutorRun(QueryDesc *queryDesc,
+							 ScanDirection direction,
+							 long count);
+static void pgss_ExecutorEnd(QueryDesc *queryDesc);
+static uint32 pgss_hash_fn(const void *key, Size keysize);
+static int pgss_match_fn(const void *key1, const void *key2, Size keysize);
+static void pgss_store(const char *query,
+					   const Instrumentation *instr, uint32 rows);
+static Size	pgss_memsize(void);
+static pgssEntry *entry_alloc(pgssHashKey *key);
+static void entry_dealloc(void);
+static void entry_reset(void);
+
+
+/*
+ * Module load callback: define GUCs, request shared resources, install hooks
+ */
+void
+_PG_init(void)
+{
+	/*
+	 * In order to create our shared memory area, we have to be loaded via
+	 * shared_preload_libraries.  If not, fall out without hooking into
+	 * any of the main system.  (We don't throw error here because it seems
+	 * useful to allow the pg_stat_statements functions to be created even
+	 * when the module isn't active.  The functions must protect themselves
+	 * against being called then, however.)
+	 */
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	/*
+	 * Define (or redefine) custom GUC variables.
+	 */
+	DefineCustomIntVariable("pg_stat_statements.max",
+							"Sets the maximum number of statements tracked by pg_stat_statements.",
+							NULL,
+							&pgss_max,
+							1000,
+							100,
+							INT_MAX,
+							PGC_POSTMASTER,
+							0,
+							NULL,
+							NULL);
+
+	DefineCustomEnumVariable("pg_stat_statements.track",
+							 "Selects which statements are tracked by pg_stat_statements.",
+							 NULL,
+							 &pgss_track,
+							 PGSS_TRACK_TOP,
+							 track_options,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL);
+
+	DefineCustomBoolVariable("pg_stat_statements.save",
+							 "Save pg_stat_statements statistics across server shutdowns.",
+							 NULL,
+							 &pgss_save,
+							 true,
+							 PGC_SIGHUP,
+							 0,
+							 NULL,
+							 NULL);
+
+	/*
+	 * Request additional shared resources.  (These are no-ops if we're not in
+	 * the postmaster process.)  pgss_memsize() reads pgss_max, so this must
+	 * follow the GUC definitions.  We attach in pgss_shmem_startup().
+	 */
+	RequestAddinShmemSpace(pgss_memsize());
+	RequestAddinLWLocks(1);		/* becomes pgss->lock */
+
+	/*
+	 * Install hooks, saving the previous values for chaining and unload.
+	 */
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = pgss_shmem_startup;
+	prev_ExecutorStart = ExecutorStart_hook;
+	ExecutorStart_hook = pgss_ExecutorStart;
+	prev_ExecutorRun = ExecutorRun_hook;
+	ExecutorRun_hook = pgss_ExecutorRun;
+	prev_ExecutorEnd = ExecutorEnd_hook;
+	ExecutorEnd_hook = pgss_ExecutorEnd;
+}
+
+/*
+ * Module unload callback: put back whatever hooks were active before us.
+ */
+void
+_PG_fini(void)
+{
+	/* Restore in the reverse of the order _PG_init installed them. */
+	ExecutorEnd_hook = prev_ExecutorEnd;
+	ExecutorRun_hook = prev_ExecutorRun;
+	ExecutorStart_hook = prev_ExecutorStart;
+	shmem_startup_hook = prev_shmem_startup_hook;
+}
+
+/*
+ * shmem_startup hook: allocate or attach to shared memory, then load saved
+ * statistics from PGSS_DUMP_FILE (a corrupt file is logged and removed).
+ */
+static void
+pgss_shmem_startup(void)
+{
+	bool		found;
+	HASHCTL		info;
+	FILE	   *file;
+	uint32		header;
+	int32		num;
+	int32		i;
+	int			query_size;
+	int			buffer_size;
+	char	   *buffer = NULL;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	/* reset in case this is a restart within the postmaster */
+	pgss = NULL;
+	pgss_hash = NULL;
+
+	/*
+	 * Create or attach to the shared memory state, including hash table
+	 */
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	pgss = ShmemInitStruct("pg_stat_statements",
+						   sizeof(pgssSharedState),
+						   &found);
+	if (!pgss)
+		elog(ERROR, "out of shared memory");
+
+	if (!found)
+	{
+		/* First time through ... */
+		pgss->lock = LWLockAssign();
+		pgss->query_size = pgstat_track_activity_query_size;
+	}
+
+	/* Be sure everyone agrees on the hash table entry size */
+	query_size = pgss->query_size;
+
+	memset(&info, 0, sizeof(info));
+	info.keysize = sizeof(pgssHashKey);
+	info.entrysize = offsetof(pgssEntry, query) + query_size;
+	info.hash = pgss_hash_fn;
+	info.match = pgss_match_fn;
+	pgss_hash = ShmemInitHash("pg_stat_statements hash",
+							  pgss_max, pgss_max,
+							  &info,
+							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
+	if (!pgss_hash)
+		elog(ERROR, "out of shared memory");
+
+	LWLockRelease(AddinShmemInitLock);
+
+	/*
+	 * If we're in the postmaster (or a standalone backend...), set up a
+	 * shmem exit hook to dump the statistics to disk.
+	 */
+	if (!IsUnderPostmaster)
+		on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
+
+	/*
+	 * Attempt to load old statistics from the dump file.
+	 *
+	 * Note: we don't bother with locks here, because there should be no
+	 * other processes running when this is called.
+	 */
+	if (!pgss_save)
+		return;
+
+	file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
+	if (file == NULL)
+	{
+		if (errno == ENOENT)
+			return;				/* ignore not-found error */
+		goto error;
+	}
+
+	buffer_size = query_size;
+	buffer = (char *) palloc(buffer_size);
+
+	if (fread(&header, sizeof(uint32), 1, file) != 1 ||
+		header != PGSS_FILE_HEADER ||
+		fread(&num, sizeof(int32), 1, file) != 1)
+		goto error;
+
+	for (i = 0; i < num; i++)
+	{
+		pgssEntry	temp;
+		pgssEntry   *entry;
+
+		if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1)
+			goto error;			/* short read of key + counters */
+
+		/* Sanity-check what we can: encoding validity, non-negative length */
+		if (!PG_VALID_BE_ENCODING(temp.key.encoding) || temp.key.query_len < 0)
+			goto error;
+
+		/* Previous incarnation might have had a larger query_size */
+		if (temp.key.query_len >= buffer_size)
+		{
+			buffer = (char *) repalloc(buffer, temp.key.query_len + 1);
+			buffer_size = temp.key.query_len + 1;
+		}
+
+		if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len)
+			goto error;
+		buffer[temp.key.query_len] = '\0';
+
+		/* Clip to available length if needed */
+		if (temp.key.query_len >= query_size)
+			temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding,
+													   buffer,
+													   temp.key.query_len,
+													   query_size - 1);
+		temp.key.query_ptr = buffer;	/* pointer stored in file is stale */
+
+		/* make the hashtable entry (discards old entries if too many) */
+		entry = entry_alloc(&temp.key);
+
+		/* copy in the actual stats */
+		entry->counters = temp.counters;
+	}
+
+	pfree(buffer);
+	FreeFile(file);
+	return;
+
+error:
+	ereport(LOG,
+			(errcode_for_file_access(),
+			 errmsg("could not read pg_stat_statement file \"%s\": %m",
+					PGSS_DUMP_FILE)));
+	if (buffer)
+		pfree(buffer);
+	if (file)
+		FreeFile(file);
+	/* If possible, throw away the bogus file; ignore any error */
+	unlink(PGSS_DUMP_FILE);
+}
+
+/*
+ * shmem_shutdown hook: dump statistics into PGSS_DUMP_FILE.  Nothing is
+ * written when "code" is nonzero (crash exit); "arg" is not used.
+ * Note: we don't bother with acquiring lock, because there should be no
+ * other processes running when this is called.
+ */
+static void
+pgss_shmem_shutdown(int code, Datum arg)
+{
+	FILE			   *file;
+	HASH_SEQ_STATUS		hash_seq;
+	int32				num_entries;
+	pgssEntry		   *entry;
+
+	/* Don't try to dump during a crash. */
+	if (code)
+		return;
+
+	/* Safety check ... shouldn't get here unless shmem is set up. */
+	if (!pgss || !pgss_hash)
+		return;
+
+	/* Don't dump if told not to. */
+	if (!pgss_save)
+		return;
+
+	file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_W);
+	if (file == NULL)
+		goto error;
+
+	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)	/* magic */
+		goto error;
+	num_entries = hash_get_num_entries(pgss_hash);
+	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)	/* entry count */
+		goto error;
+
+	hash_seq_init(&hash_seq, pgss_hash);
+	while ((entry = hash_seq_search(&hash_seq)) != NULL)
+	{
+		int		len = entry->key.query_len;
+
+		if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 || /* key + counters */
+			fwrite(entry->query, 1, len, file) != len)	/* text, no terminator */
+			goto error;
+	}
+
+	if (FreeFile(file))
+	{
+		file = NULL;			/* don't try to close it again below */
+		goto error;
+	}
+
+	return;
+
+error:
+	ereport(LOG,
+			(errcode_for_file_access(),
+			 errmsg("could not write pg_stat_statement file \"%s\": %m",
+					PGSS_DUMP_FILE)));
+	if (file)
+		FreeFile(file);
+	unlink(PGSS_DUMP_FILE);		/* remove the possibly-incomplete file */
+}
+
+/*
+ * ExecutorStart hook: run the real ExecutorStart, then arrange for the
+ * query's total time to be instrumented if we are going to track it.
+ */
+static void
+pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+	MemoryContext	saved_cxt;
+
+	/* Chain to the previous hook if any, else the standard routine. */
+	if (prev_ExecutorStart == NULL)
+		standard_ExecutorStart(queryDesc, eflags);
+	else
+		prev_ExecutorStart(queryDesc, eflags);
+
+	/* Nothing more to do if not tracking, or totaltime is already set up. */
+	if (!pgss_enabled() || queryDesc->totaltime != NULL)
+		return;
+
+	/*
+	 * Set up to track total elapsed time in ExecutorRun.  The space is
+	 * allocated in the per-query context so that it will go away at
+	 * ExecutorEnd.
+	 */
+	saved_cxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+	queryDesc->totaltime = InstrAlloc(1);
+	MemoryContextSwitchTo(saved_cxt);
+}
+
+/*
+ * ExecutorRun hook: bump the nesting depth around the real ExecutorRun
+ */
+static void
+pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
+{
+	nested_level += 1;
+	PG_TRY();
+	{
+		if (prev_ExecutorRun == NULL)
+			standard_ExecutorRun(queryDesc, direction, count);
+		else
+			prev_ExecutorRun(queryDesc, direction, count);
+		nested_level -= 1;
+	}
+	PG_CATCH();
+	{
+		nested_level -= 1;		/* undo our increment on error exit too */
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: record the statement's stats if tracked, then chain on
+ */
+static void
+pgss_ExecutorEnd(QueryDesc *queryDesc)
+{
+	Instrumentation *timing = queryDesc->totaltime;
+
+	if (timing != NULL && pgss_enabled())
+	{
+		/*
+		 * Finish stats accumulation first.  (It's harmless if several
+		 * levels of hook all do this.)
+		 */
+		InstrEndLoop(timing);
+		pgss_store(queryDesc->sourceText, timing,
+				   queryDesc->estate->es_processed);
+	}
+
+	if (prev_ExecutorEnd == NULL)
+		standard_ExecutorEnd(queryDesc);
+	else
+		prev_ExecutorEnd(queryDesc);
+}
+
+/*
+ * Hash a key: combine hashes of user OID, database OID and the query bytes
+ */
+static uint32
+pgss_hash_fn(const void *key, Size keysize)
+{
+	const pgssHashKey *hkey = (const pgssHashKey *) key;
+	const unsigned char *text = (const unsigned char *) hkey->query_ptr;
+
+	/* encoding is deliberately left out of the hash */
+	return hash_uint32((uint32) hkey->userid) ^
+		hash_uint32((uint32) hkey->dbid) ^
+		DatumGetUInt32(hash_any(text, hkey->query_len));
+}
+
+/*
+ * Compare two keys - returns 0 if they match, 1 if they differ
+ */
+static int
+pgss_match_fn(const void *key1, const void *key2, Size keysize)
+{
+	const pgssHashKey *a = (const pgssHashKey *) key1;
+	const pgssHashKey *b = (const pgssHashKey *) key2;
+
+	/* cheap scalar fields first; any difference means no match */
+	if (a->userid != b->userid || a->dbid != b->dbid)
+		return 1;
+	if (a->encoding != b->encoding || a->query_len != b->query_len)
+		return 1;
+
+	/* lengths are equal, so compare exactly that many bytes of text */
+	return (memcmp(a->query_ptr, b->query_ptr, a->query_len) == 0) ? 0 : 1;
+}
+
+/*
+ * Add one execution of "query" (time from instr, row count) to its entry.
+ */
+static void
+pgss_store(const char *query, const Instrumentation *instr, uint32 rows)
+{
+	pgssHashKey	key;
+	double		usage;
+	pgssEntry   *entry;
+
+	Assert(query != NULL);
+
+	/* Safety check... */
+	if (!pgss || !pgss_hash)
+		return;
+
+	/* Set up key for hashtable search */
+	key.userid = GetUserId();
+	key.dbid = MyDatabaseId;
+	key.encoding = GetDatabaseEncoding();
+	key.query_len = strlen(query);
+	if (key.query_len >= pgss->query_size)	/* too long to store: clip it */
+		key.query_len = pg_encoding_mbcliplen(key.encoding,
+											  query,
+											  key.query_len,
+											  pgss->query_size - 1);
+	key.query_ptr = query;		/* not copied; key is only used for lookup */
+
+	usage = USAGE_EXEC(duration);	/* macro ignores its argument for now */
+
+	/* Lookup the hash table entry with shared lock. */
+	LWLockAcquire(pgss->lock, LW_SHARED);
+
+	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
+	if (!entry)
+	{
+		/* Must acquire exclusive lock to add a new entry. */
+		LWLockRelease(pgss->lock);
+		LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
+		entry = entry_alloc(&key);	/* finds it if added meanwhile */
+	}
+
+	/* Grab the spinlock while updating the counters. */
+	{
+		volatile pgssEntry *e = (volatile pgssEntry *) entry;
+
+		SpinLockAcquire(&e->mutex);
+		e->counters.calls += 1;
+		e->counters.total_time += instr->total;
+		e->counters.rows += rows;
+		e->counters.usage += usage;
+		SpinLockRelease(&e->mutex);
+	}
+
+	LWLockRelease(pgss->lock);	/* shared or exclusive, whichever we hold */
+}
+
+/*
+ * SQL-callable: discard every tracked statement entry.
+ */
+Datum
+pg_stat_statements_reset(PG_FUNCTION_ARGS)
+{
+	if (pgss == NULL || pgss_hash == NULL)	/* shared state never set up */
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
+	entry_reset();
+	PG_RETURN_VOID();
+}
+
+#define PG_STAT_STATEMENTS_COLS		6
+
+/*
+ * Retrieve statement statistics as a materialized set, one row per entry.
+ */
+Datum
+pg_stat_statements(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo	   *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc			tupdesc;
+	Tuplestorestate    *tupstore;
+	MemoryContext		per_query_ctx;
+	MemoryContext		oldcontext;
+	Oid					userid = GetUserId();
+	bool				is_superuser = superuser();
+	HASH_SEQ_STATUS		hash_seq;
+	pgssEntry		   *entry;
+
+	if (!pgss || !pgss_hash)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not " \
+						"allowed in this context")));
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupdesc = CreateTemplateTupleDesc(PG_STAT_STATEMENTS_COLS, false);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "userid",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dbid",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "query",
+					   TEXTOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "calls",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 5, "total_time",
+					   FLOAT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 6, "rows",
+					   INT8OID, -1, 0);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	LWLockAcquire(pgss->lock, LW_SHARED);	/* keeps entries from vanishing */
+
+	hash_seq_init(&hash_seq, pgss_hash);
+	while ((entry = hash_seq_search(&hash_seq)) != NULL)
+	{
+		Datum		values[PG_STAT_STATEMENTS_COLS];
+		bool		nulls[PG_STAT_STATEMENTS_COLS];
+		int			i = 0;
+		Counters	tmp;
+
+		/* generate junk in short-term context */
+		MemoryContextSwitchTo(oldcontext);
+
+		memset(values, 0, sizeof(values));
+		memset(nulls, 0, sizeof(nulls));
+
+		values[i++] = ObjectIdGetDatum(entry->key.userid);
+		values[i++] = ObjectIdGetDatum(entry->key.dbid);
+
+		if (is_superuser || entry->key.userid == userid)	/* may see text */
+		{
+			char   *qstr;
+
+			qstr = (char *)
+				pg_do_encoding_conversion((unsigned char *) entry->query,
+										  entry->key.query_len,
+										  entry->key.encoding,
+										  GetDatabaseEncoding());
+			values[i++] = CStringGetTextDatum(qstr);
+			if (qstr != entry->query)	/* free only if we got a new buffer */
+				pfree(qstr);
+		}
+		else
+			values[i++] = CStringGetTextDatum("<insufficient privilege>");
+
+		/* copy counters to a local variable to keep locking time short */
+		{
+			volatile pgssEntry *e = (volatile pgssEntry *) entry;
+
+			SpinLockAcquire(&e->mutex);
+			tmp = e->counters;
+			SpinLockRelease(&e->mutex);
+		}
+
+		values[i++] = Int64GetDatumFast(tmp.calls);
+		values[i++] = Float8GetDatumFast(tmp.total_time);
+		values[i++] = Int64GetDatumFast(tmp.rows);
+
+		Assert(i == PG_STAT_STATEMENTS_COLS);
+
+		/* switch to appropriate context while storing the tuple */
+		MemoryContextSwitchTo(per_query_ctx);
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	LWLockRelease(pgss->lock);
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return (Datum) 0;			/* rows are delivered via rsinfo->setResult */
+}
+
+/*
+ * Estimate shared memory needed: the control struct plus a hash table of
+ * pgss_max entries, each sized for the maximum query text length.
+ */
+static Size
+pgss_memsize(void)
+{
+	Size	per_entry = offsetof(pgssEntry, query) +
+						pgstat_track_activity_query_size;
+	Size	state_bytes = MAXALIGN(sizeof(pgssSharedState));
+	Size	hash_bytes = hash_estimate_size(pgss_max, per_entry);
+
+	/* total = shared state + hash table */
+	return add_size(state_bytes, hash_bytes);
+}
+
+/*
+ * Find the hashtable entry for *key, creating and initializing it if absent.
+ * Caller must hold an exclusive lock on pgss->lock.
+ *
+ * Note: even with the exclusive lock held, the entry may legitimately exist
+ * already.  pgss_store drops its shared lock before taking the exclusive
+ * one after a failed lookup, so another backend could have created the
+ * entry in between.
+ */
+static pgssEntry *
+entry_alloc(pgssHashKey *key)
+{
+	pgssEntry  *result;
+	bool		exists;
+
+	/* Caller must have clipped query properly */
+	Assert(key->query_len < pgss->query_size);
+
+	/* Evict least-used entries until there is room for one more */
+	while (hash_get_num_entries(pgss_hash) >= pgss_max)
+		entry_dealloc();
+
+	/* Find or create an entry with desired hash code */
+	result = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &exists);
+	if (exists)
+		return result;
+
+	/*
+	 * New entry.  dynahash copied the key for us, but query_ptr must be
+	 * redirected to the entry's own text buffer, which we fill in here.
+	 */
+	memcpy(result->query, key->query_ptr, key->query_len);
+	result->query[key->query_len] = '\0';
+	result->key.query_ptr = result->query;
+
+	/* start with zeroed statistics, apart from the initial usage */
+	memset(&result->counters, 0, sizeof(Counters));
+	result->counters.usage = USAGE_INIT;
+
+	/* re-initialize the mutex each time ... we assume no one using it */
+	SpinLockInit(&result->mutex);
+	return result;
+}
+
+/*
+ * qsort comparator: orders pgssEntry pointers by increasing usage
+ */
+static int
+entry_cmp(const void *lhs, const void *rhs)
+{
+	const pgssEntry *left = *(const pgssEntry **) lhs;
+	const pgssEntry *right = *(const pgssEntry **) rhs;
+	double		lu = left->counters.usage;
+	double		ru = right->counters.usage;
+
+	/* least-used entries sort first */
+	if (lu > ru)
+		return +1;
+	return (lu < ru) ? -1 : 0;
+}
+
+/*
+ * Evict the least-used entries (USAGE_DEALLOC_PERCENT of them, minimum 10).
+ * Caller must hold an exclusive lock on pgss->lock.
+ */
+static void
+entry_dealloc(void)
+{
+	HASH_SEQ_STATUS		scan;
+	pgssEntry		  **sorted;
+	pgssEntry		   *cur;
+	int					count = 0;
+	int					nvictims;
+	int					v;
+
+	sorted = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
+
+	/* Collect all entries, decaying each one's usage as we go. */
+	hash_seq_init(&scan, pgss_hash);
+	while ((cur = hash_seq_search(&scan)) != NULL)
+	{
+		cur->counters.usage *= USAGE_DECREASE_FACTOR;
+		sorted[count++] = cur;
+	}
+
+	qsort(sorted, count, sizeof(pgssEntry *), entry_cmp);
+
+	/* Victim count: the configured percentage, floor of 10, capped at all. */
+	nvictims = Max(10, count * USAGE_DEALLOC_PERCENT / 100);
+	if (nvictims > count)
+		nvictims = count;
+
+	for (v = 0; v < nvictims; v++)
+		hash_search(pgss_hash, &sorted[v]->key, HASH_REMOVE, NULL);
+
+	pfree(sorted);
+}
+
+/*
+ * Remove every entry from the hashtable, under exclusive lock.
+ */
+static void
+entry_reset(void)
+{
+	HASH_SEQ_STATUS		scan;
+	pgssEntry		   *victim;
+
+	LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
+
+	hash_seq_init(&scan, pgss_hash);
+
+	/* Delete each entry as the scan returns it. */
+	while ((victim = hash_seq_search(&scan)) != NULL)
+		hash_search(pgss_hash, &victim->key, HASH_REMOVE, NULL);
+
+	LWLockRelease(pgss->lock);
+}
diff --git a/contrib/pg_stat_statements/pg_stat_statements.sql.in b/contrib/pg_stat_statements/pg_stat_statements.sql.in
new file mode 100644
index 0000000000..7655136ed9
--- /dev/null
+++ b/contrib/pg_stat_statements/pg_stat_statements.sql.in
@@ -0,0 +1,31 @@
+/* $PostgreSQL: pgsql/contrib/pg_stat_statements/pg_stat_statements.sql.in,v 1.1 2009/01/04 22:19:59 tgl Exp $ */
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+-- Register functions.
+CREATE FUNCTION pg_stat_statements_reset()
+RETURNS void
+AS 'MODULE_PATHNAME'
+LANGUAGE C;
+
+CREATE FUNCTION pg_stat_statements(
+    OUT userid oid,
+    OUT dbid oid,
+    OUT query text,
+    OUT calls int8,
+    OUT total_time float8,
+    OUT rows int8
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME'
+LANGUAGE C;
+
+-- Register a view on the function for ease of use.
+CREATE VIEW pg_stat_statements AS
+  SELECT * FROM pg_stat_statements();
+
+GRANT SELECT ON pg_stat_statements TO PUBLIC;
+
+-- Don't want this to be available to non-superusers.
+REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
diff --git a/contrib/pg_stat_statements/uninstall_pg_stat_statements.sql b/contrib/pg_stat_statements/uninstall_pg_stat_statements.sql
new file mode 100644
index 0000000000..31fd0af39d
--- /dev/null
+++ b/contrib/pg_stat_statements/uninstall_pg_stat_statements.sql
@@ -0,0 +1,8 @@
+/* $PostgreSQL: pgsql/contrib/pg_stat_statements/uninstall_pg_stat_statements.sql,v 1.1 2009/01/04 22:19:59 tgl Exp $ */
+
+-- Adjust this setting to control where the objects get dropped.
+SET search_path = public;
+
+DROP VIEW pg_stat_statements;
+DROP FUNCTION pg_stat_statements();
+DROP FUNCTION pg_stat_statements_reset();
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml
index ecc5a0b23e..89fb5314fe 100644
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.10 2008/11/19 02:59:28 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.11 2009/01/04 22:19:59 tgl Exp $ -->
 
 <appendix id="contrib">
  <title>Additional Supplied Modules</title>
@@ -103,6 +103,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
  &pgfreespacemap;
  &pgrowlocks;
  &pgstandby;
+ &pgstatstatements;
  &pgstattuple;
  &pgtrgm;
  &seg;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index ea1c7c274f..273d5a0979 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.58 2008/11/19 02:59:28 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.59 2009/01/04 22:19:59 tgl Exp $ -->
 
 <!entity history    SYSTEM "history.sgml">
 <!entity info       SYSTEM "info.sgml">
@@ -116,6 +116,7 @@
 <!entity pgfreespacemap  SYSTEM "pgfreespacemap.sgml">
 <!entity pgrowlocks      SYSTEM "pgrowlocks.sgml">
 <!entity pgstandby       SYSTEM "pgstandby.sgml">
+<!entity pgstatstatements SYSTEM "pgstatstatements.sgml">
 <!entity pgstattuple     SYSTEM "pgstattuple.sgml">
 <!entity pgtrgm          SYSTEM "pgtrgm.sgml">
 <!entity seg             SYSTEM "seg.sgml">
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
new file mode 100644
index 0000000000..93cda9f406
--- /dev/null
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -0,0 +1,265 @@
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgstatstatements.sgml,v 1.1 2009/01/04 22:19:59 tgl Exp $ -->
+
+<sect1 id="pgstatstatements">
+ <title>pg_stat_statements</title>
+
+ <indexterm zone="pgstatstatements">
+  <primary>pg_stat_statements</primary>
+ </indexterm>
+
+ <para>
+  The <filename>pg_stat_statements</filename> module provides a means for
+  tracking execution statistics of all SQL statements executed by a server.
+ </para>
+
+ <para>
+  The module must be loaded by adding <literal>pg_stat_statements</> to
+  <xref linkend="guc-shared-preload-libraries"> in
+  <filename>postgresql.conf</>, because it requires additional shared memory.
+  This means that a server restart is needed to add or remove the module.
+ </para>
+
+ <sect2>
+  <title>The <structname>pg_stat_statements</structname> view</title>
+
+  <para>
+   The statistics gathered by the module are made available via a system view
+   named <structname>pg_stat_statements</>.  This view contains one row for
+   each distinct combination of query text, database ID, and user ID (up to
+   the maximum number of distinct statements that the module can track).
+   The columns of the view are:
+  </para>
+
+  <table>
+   <title><structname>pg_stat_statements</> columns</title>
+
+   <tgroup cols="4">
+    <thead>
+     <row>
+      <entry>Name</entry>
+      <entry>Type</entry>
+      <entry>References</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+    <tbody>
+     <row>
+      <entry><structfield>userid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
+      <entry>OID of user who executed the statement</entry>
+     </row>
+
+     <row>
+      <entry><structfield>dbid</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-database"><structname>pg_database</structname></link>.oid</literal></entry>
+      <entry>OID of database in which the statement was executed</entry>
+     </row>
+
+     <row>
+      <entry><structfield>query</structfield></entry>
+      <entry><type>text</type></entry>
+      <entry></entry>
+      <entry>Text of the statement (up to <xref linkend="guc-track-activity-query-size"> bytes)</entry>
+     </row>
+
+     <row>
+      <entry><structfield>calls</structfield></entry>
+      <entry><type>bigint</type></entry>
+      <entry></entry>
+      <entry>Number of times executed</entry>
+     </row>
+
+     <row>
+      <entry><structfield>total_time</structfield></entry>
+      <entry><type>double precision</type></entry>
+      <entry></entry>
+      <entry>Total time spent in the statement, in seconds</entry>
+     </row>
+
+     <row>
+      <entry><structfield>rows</structfield></entry>
+      <entry><type>bigint</type></entry>
+      <entry></entry>
+      <entry>Total number of rows retrieved or affected by the statement</entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   This view, and the function <function>pg_stat_statements_reset</>,
+   are available only in databases they have been specifically installed into
+   by running the <filename>pg_stat_statements.sql</> install script.
+   However, statistics are tracked across all databases of the server
+   whenever the <filename>pg_stat_statements</filename> module is loaded
+   into the server, regardless of the presence of the view.
+  </para>
+
+  <para>
+   For security reasons, non-superusers are not allowed to see the text of
+   queries executed by other users.  They can see the statistics, however,
+   if the view has been installed in their database.
+  </para>
+
+  <para>
+   Note that statements are considered the same if they have the same text,
+   regardless of the values of any out-of-line parameters used in the
+   statement.  Using out-of-line parameters will help to group statements
+   together and may make the statistics more useful.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Functions</title>
+
+  <variablelist>
+   <varlistentry>
+    <term>
+     <function>pg_stat_statements_reset() returns void</function>
+    </term>
+
+    <listitem>
+     <para>
+      <function>pg_stat_statements_reset</function> discards all statistics
+      gathered so far by <filename>pg_stat_statements</>.
+      By default, this function can only be executed by superusers.
+     </para>
+    </listitem>
+   </varlistentry>
+
+  </variablelist>
+ </sect2>
+
+ <sect2>
+  <title>Configuration parameters</title>
+
+  <variablelist>
+   <varlistentry>
+    <term>
+     <varname>pg_stat_statements.max</varname> (<type>integer</type>)
+    </term>
+
+    <listitem>
+     <para>
+      <varname>pg_stat_statements.max</varname> is the maximum number of
+      statements tracked by the module (i.e., the maximum number of rows
+      in the <structname>pg_stat_statements</> view).  If more distinct
+      statements than that are observed, information about the least-executed
+      statements is discarded.
+      The default value is 1000.
+      This parameter can only be set at server start.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <varname>pg_stat_statements.track</varname> (<type>enum</type>)
+    </term>
+
+    <listitem>
+     <para>
+      <varname>pg_stat_statements.track</varname> controls which statements
+      are counted by the module.
+      Specify <literal>top</> to track top-level statements (those issued
+      directly by clients), <literal>all</> to also track nested statements
+      (such as statements invoked within functions), or <literal>none</> to
+      disable statement tracking.
+      The default value is <literal>top</>.
+      Only superusers can change this setting.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
+     <varname>pg_stat_statements.save</varname> (<type>boolean</type>)
+    </term>
+
+    <listitem>
+     <para>
+      <varname>pg_stat_statements.save</varname> specifies whether to
+      save statement statistics across server shutdowns.
+      If it is <literal>off</> then statistics are neither saved at
+      shutdown nor reloaded at server start.
+      The default value is <literal>on</>.
+      This parameter can only be set in the <filename>postgresql.conf</>
+      file or on the server command line.
+     </para>
+    </listitem>
+   </varlistentry>
+  </variablelist>
+
+  <para>
+   The module requires additional shared memory amounting to about
+   <varname>pg_stat_statements.max</varname> <literal>*</>
+   <xref linkend="guc-track-activity-query-size"> bytes.  Note that this
+   memory is consumed whenever the module is loaded, even if
+   <varname>pg_stat_statements.track</> is set to <literal>none</>.
+  </para>
+
+  <para>
+   In order to set any of these parameters in your
+   <filename>postgresql.conf</> file,
+   you will need to add <literal>pg_stat_statements</> to
+   <xref linkend="guc-custom-variable-classes">.  Typical usage might be:
+  </para>
+
+  <programlisting>
+# postgresql.conf
+shared_preload_libraries = 'pg_stat_statements'
+
+custom_variable_classes = 'pg_stat_statements'
+pg_stat_statements.max = 10000
+pg_stat_statements.track = all
+  </programlisting>
+ </sect2>
+
+ <sect2>
+  <title>Sample output</title>
+
+  <programlisting>
+$ pgbench -i bench
+
+postgres=# SELECT pg_stat_statements_reset();
+
+$ pgbench -c10 -t300 -M prepared bench
+
+postgres=# \x
+postgres=# SELECT * FROM pg_stat_statements ORDER BY total_time DESC LIMIT 3;
+-[ RECORD 1 ]------------------------------------------------------------
+userid     | 10
+dbid       | 63781
+query      | UPDATE branches SET bbalance = bbalance + $1 WHERE bid = $2;
+calls      | 3000
+total_time | 20.716706
+rows       | 3000
+-[ RECORD 2 ]------------------------------------------------------------
+userid     | 10
+dbid       | 63781
+query      | UPDATE tellers SET tbalance = tbalance + $1 WHERE tid = $2;
+calls      | 3000
+total_time | 17.1107649999999
+rows       | 3000
+-[ RECORD 3 ]------------------------------------------------------------
+userid     | 10
+dbid       | 63781
+query      | UPDATE accounts SET abalance = abalance + $1 WHERE aid = $2;
+calls      | 3000
+total_time | 0.645601
+rows       | 3000
+  </programlisting>
+ </sect2>
+
+ <sect2>
+  <title>Author</title>
+
+  <para>
+   Takahiro Itagaki <email>itagaki.takahiro@oss.ntt.co.jp</email>
+  </para>
+ </sect2>
+
+</sect1>
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 22bc5b1f3f..0d0d23f53a 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.569 2009/01/03 17:08:38 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.570 2009/01/04 22:19:59 tgl Exp $
  *
  * NOTES
  *
@@ -2731,7 +2731,7 @@ PostmasterStateMachine(void)
 		ereport(LOG,
 				(errmsg("all server processes terminated; reinitializing")));
 
-		shmem_exit(0);
+		shmem_exit(1);
 		reset_shared(PostPortNumber);
 
 		StartupPID = StartupDataBase();
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 5bb4cc597a..169821b79e 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -5,7 +5,7 @@
  *
  *	Copyright (c) 2001-2009, PostgreSQL Global Development Group
  *
- *	$PostgreSQL: pgsql/src/include/pgstat.h,v 1.81 2009/01/01 17:23:55 momjian Exp $
+ *	$PostgreSQL: pgsql/src/include/pgstat.h,v 1.82 2009/01/04 22:19:59 tgl Exp $
  * ----------
  */
 #ifndef PGSTAT_H
@@ -592,7 +592,7 @@ typedef struct PgStat_FunctionCallUsage
 extern bool pgstat_track_activities;
 extern bool pgstat_track_counts;
 extern int	pgstat_track_functions;
-extern int	pgstat_track_activity_query_size;
+extern PGDLLIMPORT int	pgstat_track_activity_query_size;
 extern char *pgstat_stat_tmpname;
 extern char *pgstat_stat_filename;