]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/pgstat.c
Update copyright for 2009.
[postgresql] / src / backend / postmaster / pgstat.c
1 /* ----------
2  * pgstat.c
3  *
4  *      All the statistics collector stuff hacked up in one big, ugly file.
5  *
6  *      TODO:   - Separate collector, postmaster and backend stuff
7  *                        into different files.
8  *
9  *                      - Add some automatic call for pgstat vacuuming.
10  *
11  *                      - Add a pgstat config column to pg_database, so this
12  *                        entire thing can be enabled/disabled on a per db basis.
13  *
14  *      Copyright (c) 2001-2009, PostgreSQL Global Development Group
15  *
16  *      $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.187 2009/01/01 17:23:46 momjian Exp $
17  * ----------
18  */
19 #include "postgres.h"
20
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/param.h>
24 #include <sys/time.h>
25 #include <sys/socket.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <arpa/inet.h>
29 #include <signal.h>
30 #include <time.h>
31 #ifdef HAVE_POLL_H
32 #include <poll.h>
33 #endif
34 #ifdef HAVE_SYS_POLL_H
35 #include <sys/poll.h>
36 #endif
37
38 #include "pgstat.h"
39
40 #include "access/heapam.h"
41 #include "access/transam.h"
42 #include "access/twophase_rmgr.h"
43 #include "access/xact.h"
44 #include "catalog/pg_database.h"
45 #include "catalog/pg_proc.h"
46 #include "libpq/ip.h"
47 #include "libpq/libpq.h"
48 #include "libpq/pqsignal.h"
49 #include "mb/pg_wchar.h"
50 #include "miscadmin.h"
51 #include "pg_trace.h"
52 #include "postmaster/autovacuum.h"
53 #include "postmaster/fork_process.h"
54 #include "postmaster/postmaster.h"
55 #include "storage/backendid.h"
56 #include "storage/fd.h"
57 #include "storage/ipc.h"
58 #include "storage/pg_shmem.h"
59 #include "storage/pmsignal.h"
60 #include "utils/guc.h"
61 #include "utils/memutils.h"
62 #include "utils/ps_status.h"
63 #include "utils/rel.h"
64 #include "utils/tqual.h"
65
66
/* ----------
 * Paths for the permanent statistics files (relative to installation's
 * $PGDATA).
 *
 * pgstat_reset_all() unlinks PGSTAT_STAT_PERMANENT_FILENAME together with
 * the file named by pgstat_stat_filename.
 * ----------
 */
#define PGSTAT_STAT_PERMANENT_FILENAME          "global/pgstat.stat"
/* NOTE(review): presumably the scratch name used while (re)writing the
 * permanent file -- confirm against pgstat_write_statsfile(). */
#define PGSTAT_STAT_PERMANENT_TMPFILE           "global/pgstat.tmp"
73
/* ----------
 * Timer definitions.
 *
 * Note the mixed units: the first three values are in milliseconds, the
 * last two in seconds.
 * ----------
 */

/* Minimum time between stats file updates; in milliseconds. */
#define PGSTAT_STAT_INTERVAL    500

/* How long to wait between statistics update requests; in milliseconds. */
#define PGSTAT_RETRY_DELAY              10

/* Maximum time to wait for a stats file update; in milliseconds. */
#define PGSTAT_MAX_WAIT_TIME    5000

/*
 * How often to attempt to restart a failed statistics collector; in
 * seconds.  pgstat_start() refuses to fork again until this much time has
 * passed since the previous attempt.
 */
#define PGSTAT_RESTART_INTERVAL 60

/* How often to check for postmaster death; in seconds. */
#define PGSTAT_SELECT_TIMEOUT   2

/*
 * Number of PGSTAT_RETRY_DELAY-sized waits that fit into
 * PGSTAT_MAX_WAIT_TIME (5000 / 10 = 500 with the values above).
 */
#define PGSTAT_POLL_LOOP_COUNT  (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
95
96
/* ----------
 * The initial size hints for the hash tables used in the collector.
 *
 * These are starting sizes only (hints), one each for the database,
 * table and function hash tables.
 * ----------
 */
#define PGSTAT_DB_HASH_SIZE             16
#define PGSTAT_TAB_HASH_SIZE    512
#define PGSTAT_FUNCTION_HASH_SIZE       512
104
105
/* ----------
 * GUC parameters
 *
 * The values below are only the compiled-in initial values.  Both boolean
 * switches start out disabled and function tracking starts at
 * TRACK_FUNC_OFF.
 *
 * NOTE(review): pgstat_init() forces the "track_counts" setting off when no
 * working collector socket can be created; presumably that setting is the
 * one backing pgstat_track_counts -- confirm in guc.c.
 * ----------
 */
bool            pgstat_track_activities = false;
bool            pgstat_track_counts = false;
int                     pgstat_track_functions = TRACK_FUNC_OFF;
int                     pgstat_track_activity_query_size = 1024;

/* ----------
 * Built from GUC parameter
 *
 * Both start out NULL.  pgstat_reset_all() passes pgstat_stat_filename to
 * unlink(), so it is expected to have been filled in by then.
 * ----------
 */
char       *pgstat_stat_filename = NULL;
char       *pgstat_stat_tmpname = NULL;
121
/*
 * BgWriter global statistics counters (unused in other processes).
 * Stored directly in a stats message structure so it can be sent
 * without needing to copy things around.  We assume this inits to zeroes.
 */
PgStat_MsgBgWriter BgWriterStats;

/* ----------
 * Local data
 * ----------
 */

/*
 * The statistics socket.  A negative value means "no usable socket":
 * pgstat_init() leaves it at -1 on failure, pgstat_start() then declines to
 * launch a collector and pgstat_send_tabstat() silently drops its message.
 */
NON_EXEC_STATIC int pgStatSock = -1;

/*
 * Address of pgStatSock as reported by getsockname() in pgstat_init(); the
 * socket is connect()ed to this same address.
 */
static struct sockaddr_storage pgStatAddr;

/*
 * time() of the most recent collector launch attempt.  pgstat_start() uses
 * it to enforce PGSTAT_RESTART_INTERVAL; allow_immediate_pgstat_restart()
 * zeroes it to lift that restriction.
 */
static time_t last_pgstat_start_time;

/* NOTE(review): presumably set once this process is the collector itself;
 * not referenced in the code visible here -- confirm. */
static bool pgStatRunningInCollector = false;
140
/*
 * Structures in which backends store per-table info that's waiting to be
 * sent to the collector.
 *
 * NOTE: once allocated, TabStatusArray structures are never moved or deleted
 * for the life of the backend.  Also, we zero out the t_id fields of the
 * contained PgStat_TableStatus structs whenever they are not actively in use.
 * This allows relcache pgstat_info pointers to be treated as long-lived data,
 * avoiding repeated searches in pgstat_initstats() when a relation is
 * repeatedly opened during a transaction.
 *
 * pgstat_report_stat() walks the whole chain, copies every entry with
 * nonzero counts into an outgoing message, then zeroes the used entries and
 * resets tsa_used to 0 so the arrays can be reused.
 */
#define TABSTAT_QUANTUM         100 /* we alloc this many at a time */

typedef struct TabStatusArray
{
        struct TabStatusArray *tsa_next;        /* link to next array, if any */
        int                     tsa_used;               /* # entries currently used */
        PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
} TabStatusArray;

/* Head of the chain of arrays; NULL until the first array is allocated. */
static TabStatusArray *pgStatTabList = NULL;
162
/*
 * Backends store per-function info that's waiting to be sent to the collector
 * in this hash table (indexed by function OID).
 *
 * Stays NULL until the table is created; pgstat_send_funcstats() returns
 * immediately in that case.
 */
static HTAB *pgStatFunctions = NULL;

/*
 * Indicates if backend has some function stats that it hasn't yet
 * sent to the collector.
 *
 * Cleared by pgstat_send_funcstats() once the hash table has been scanned;
 * pgstat_report_stat() consults it to decide whether there is any work.
 */
static bool have_function_stats = false;
174
/*
 * Tuple insertion/deletion counts for an open transaction can't be propagated
 * into PgStat_TableStatus counters until we know if it is going to commit
 * or abort.  Hence, we keep these counts in per-subxact structs that live
 * in TopTransactionContext.  This data structure is designed on the assumption
 * that subxacts won't usually modify very many tables.
 */
typedef struct PgStat_SubXactStatus
{
        int                     nest_level;             /* subtransaction nest level */
        struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
        PgStat_TableXactStatus *first;          /* head of list for this subxact */
} PgStat_SubXactStatus;

/* Top of the per-subtransaction stack; NULL when no entry exists. */
static PgStat_SubXactStatus *pgStatXactStack = NULL;

/*
 * Commit/rollback counts accumulated since the last per-database tabstat
 * message.  pgstat_send_tabstat() copies them into such a message and then
 * resets both to zero.
 */
static int      pgStatXactCommit = 0;
static int      pgStatXactRollback = 0;
193
/*
 * Record that's written to 2PC state file when pgstat state is persisted.
 *
 * NOTE(review): since this struct is stored in a file, its field order and
 * sizes presumably form an on-disk format -- do not rearrange without
 * checking the code that reads it back.
 */
typedef struct TwoPhasePgStatRecord
{
        PgStat_Counter tuples_inserted;         /* tuples inserted in xact */
        PgStat_Counter tuples_deleted;          /* tuples deleted in xact */
        Oid                     t_id;                   /* table's OID */
        bool            t_shared;               /* is it a shared catalog? */
} TwoPhasePgStatRecord;
202
/*
 * Info about current "snapshot" of stats file
 *
 * All of these start out empty (NULL / 0).
 * NOTE(review): presumably populated by backend_read_statsfile() and
 * pgstat_read_current_status(), which are not visible here -- confirm.
 */
static MemoryContext pgStatLocalContext = NULL;
static HTAB *pgStatDBHash = NULL;
static PgBackendStatus *localBackendStatusTable = NULL;
static int      localNumBackends = 0;

/*
 * Cluster wide statistics, kept in the stats collector.
 * Contains statistics that are not collected per database
 * or per table.
 */
static PgStat_GlobalStats globalStats;

/* Last time the collector successfully wrote the stats file */
static TimestampTz last_statwrite;
/* Latest statistics request time from backends */
static TimestampTz last_statrequest;

/*
 * Flags declared volatile.
 * NOTE(review): presumably set from the pgstat_exit() and
 * pgstat_sighup_handler() signal handlers and polled by the collector's
 * main loop -- confirm.
 */
static volatile bool need_exit = false;
static volatile bool got_SIGHUP = false;

/*
 * Total time charged to functions so far in the current backend.
 * We use this to help separate "self" and "other" time charges.
 * (We assume this initializes to zero.)
 */
static instr_time total_func_time;
232
233
/* ----------
 * Local function forward declarations
 * ----------
 */

/* Launch helper used by pgstat_start(); only exists in EXEC_BACKEND builds */
#ifdef EXEC_BACKEND
static pid_t pgstat_forkexec(void);
#endif

/*
 * Collector process entry point (called by pgstat_start() in the forked
 * child) plus exit/signal callbacks.
 */
NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
static void pgstat_exit(SIGNAL_ARGS);
static void pgstat_beshutdown_hook(int code, Datum arg);
static void pgstat_sighup_handler(SIGNAL_ARGS);

/* Stats-file and snapshot handling (judging by the names; defined later) */
static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
static void pgstat_write_statsfile(bool permanent);
static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent);
static void backend_read_statsfile(void);
static void pgstat_read_current_status(void);

/* Subroutines of pgstat_report_stat(), and an OID-collection helper */
static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
static void pgstat_send_funcstats(void);
static HTAB *pgstat_collect_oids(Oid catalogid);

static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);

static void pgstat_setup_memcxt(void);

/* Message plumbing: fill in a message header, transmit a message */
static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
static void pgstat_send(void *msg, int len);

/* One receive routine per message type (presumably collector-side) */
static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
275
276
277 /* ------------------------------------------------------------
278  * Public functions called from postmaster follow
279  * ------------------------------------------------------------
280  */
281
/* Value of the one-byte probe sent through a candidate socket. */
#define TESTBYTEVAL ((char) 199)

/*
 * pgstat_probe_address() -
 *
 *      Subroutine for pgstat_init: try to turn one address candidate into a
 *      working statistics socket in pgStatSock.  The socket is created,
 *      bound, connected to its own address (recorded in pgStatAddr), and
 *      verified by sending one test byte to itself.
 *
 *      Returns true if the socket works.  Otherwise a LOG message is
 *      emitted, any socket that was opened is closed again, pgStatSock is
 *      left negative, and false is returned.
 */
static bool
pgstat_probe_address(struct addrinfo *candidate)
{
        ACCEPT_TYPE_ARG3 addrlen;
        fd_set          readable;
        struct timeval timeout;
        char            probe;
        int                     nready;

        /* Create the socket. */
        pgStatSock = socket(candidate->ai_family, SOCK_DGRAM, 0);
        if (pgStatSock < 0)
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not create socket for statistics collector: %m")));
                return false;           /* nothing to close */
        }

        /*
         * Bind to a kernel-assigned port on the candidate address, then ask
         * the kernel which address/port we actually got.
         */
        if (bind(pgStatSock, candidate->ai_addr, candidate->ai_addrlen) < 0)
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not bind socket for statistics collector: %m")));
                goto discard;
        }

        addrlen = sizeof(pgStatAddr);
        if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &addrlen) < 0)
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not get address of socket for statistics collector: %m")));
                goto discard;
        }

        /*
         * Connect the socket to its own address.  Senders then need not name
         * the destination on every send, and the kernel only delivers packets
         * originating from this same address.
         */
        if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, addrlen) < 0)
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not connect socket for statistics collector: %m")));
                goto discard;
        }

        /*
         * Push a single test byte through the socket, to catch setups where
         * the socket can be created but will not actually pass data (for
         * instance because of kernel packet filtering rules).
         */
        probe = TESTBYTEVAL;
        while (send(pgStatSock, &probe, 1, 0) != 1)
        {
                if (errno == EINTR)
                        continue;               /* interrupted, just retry */
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not send test message on socket for statistics collector: %m")));
                goto discard;
        }

        /*
         * Delivery may take a moment; arbitrarily allow up to half a second.
         * The loop only exists to restart select() after EINTR.
         */
        do
        {
                FD_ZERO(&readable);
                FD_SET(pgStatSock, &readable);
                timeout.tv_sec = 0;
                timeout.tv_usec = 500000;
                nready = select(pgStatSock + 1, &readable, NULL, NULL, &timeout);
        } while (nready < 0 && errno == EINTR);

        if (nready < 0)
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("select() failed in statistics collector: %m")));
                goto discard;
        }
        if (nready == 0 || !FD_ISSET(pgStatSock, &readable))
        {
                /*
                 * The failure mode considered most likely, hence the specific
                 * message.  errno is not meaningful here, so it is not reported.
                 */
                ereport(LOG,
                                (errcode(ERRCODE_CONNECTION_FAILURE),
                                 errmsg("test message did not get through on socket for statistics collector")));
                goto discard;
        }

        /* Scribble on the byte so a successful recv() must overwrite it. */
        probe++;

        while (recv(pgStatSock, &probe, 1, 0) != 1)
        {
                if (errno == EINTR)
                        continue;               /* interrupted, just retry */
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not receive test message on socket for statistics collector: %m")));
                goto discard;
        }

        if (probe != TESTBYTEVAL)       /* strictly paranoia ... */
        {
                ereport(LOG,
                                (errcode(ERRCODE_INTERNAL_ERROR),
                                 errmsg("incorrect test message transmission on socket for statistics collector")));
                goto discard;
        }

        return true;

discard:
        closesocket(pgStatSock);
        pgStatSock = -1;
        return false;
}

/* ----------
 * pgstat_init() -
 *
 *      Called from postmaster at startup.  Creates the UDP socket over which
 *      statistics messages are sent and received, trying each address that
 *      "localhost" resolves to until one is found to work, and switches the
 *      socket to non-blocking mode.
 *
 *      Failure is not fatal: a LOG message is written, pgStatSock is left
 *      at -1 and "track_counts" is forced off, so that the postmaster can
 *      still start with stats collection disabled.
 * ----------
 */
void
pgstat_init(void)
{
        struct addrinfo hints;
        struct addrinfo *addrs = NULL;
        struct addrinfo *addr;
        int                     rc;
        int                     attempts = 0;

        /*
         * Look up candidate addresses for the UDP socket.
         */
        hints.ai_flags = AI_PASSIVE;
        hints.ai_family = PF_UNSPEC;
        hints.ai_socktype = SOCK_DGRAM;
        hints.ai_protocol = 0;
        hints.ai_addrlen = 0;
        hints.ai_addr = NULL;
        hints.ai_canonname = NULL;
        hints.ai_next = NULL;
        rc = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
        if (rc || !addrs)
        {
                ereport(LOG,
                                (errmsg("could not resolve \"localhost\": %s",
                                                gai_strerror(rc))));
                goto startup_failed;
        }

        /*
         * On some platforms pg_getaddrinfo_all() may return several addresses
         * of which only one actually works (eg, both IPv6 and IPv4 addresses
         * when the kernel will reject IPv6), and the failure may not show up
         * until bind() or even connect().  So walk the list until a candidate
         * passes every step.  Bogus candidates produce LOG messages only.
         */
        for (addr = addrs; addr != NULL; addr = addr->ai_next)
        {
#ifdef HAVE_UNIX_SOCKETS
                /* Ignore AF_UNIX sockets, if any are returned. */
                if (addr->ai_family == AF_UNIX)
                        continue;
#endif

                if (++attempts > 1)
                        ereport(LOG,
                                        (errmsg("trying another address for the statistics collector")));

                if (pgstat_probe_address(addr))
                        break;                  /* we have a working socket */
        }

        /* Did we find a working address? */
        if (addr == NULL || pgStatSock < 0)
                goto startup_failed;

        /*
         * Set the socket to non-blocking IO.  If the collector falls behind,
         * statistics messages are then discarded rather than making backends
         * block while sending.
         */
        if (!pg_set_noblock(pgStatSock))
        {
                ereport(LOG,
                                (errcode_for_socket_access(),
                                 errmsg("could not set statistics collector socket to nonblocking mode: %m")));
                goto startup_failed;
        }

        pg_freeaddrinfo_all(hints.ai_family, addrs);

        return;

startup_failed:
        ereport(LOG,
                        (errmsg("disabling statistics collector for lack of working socket")));

        if (addrs)
                pg_freeaddrinfo_all(hints.ai_family, addrs);

        if (pgStatSock >= 0)
                closesocket(pgStatSock);
        pgStatSock = -1;

        /*
         * Adjust GUC variables to suppress useless activity, and for debugging
         * purposes (seeing track_counts off is a clue that we failed here). We
         * use PGC_S_OVERRIDE because there is no point in trying to turn it back
         * on from postgresql.conf without a restart.
         */
        SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
}
529
530 /*
531  * pgstat_reset_all() -
532  *
533  * Remove the stats file.  This is currently used only if WAL
534  * recovery is needed after a crash.
535  */
536 void
537 pgstat_reset_all(void)
538 {
539         unlink(pgstat_stat_filename);
540         unlink(PGSTAT_STAT_PERMANENT_FILENAME);
541 }
542
#ifdef EXEC_BACKEND

/*
 * pgstat_forkexec() -
 *
 * Build the argument list for the statistics collector process and hand it
 * to postmaster_forkexec(), whose result is returned unchanged.
 */
static pid_t
pgstat_forkexec(void)
{
        char       *args[10];
        int                     nargs;

        args[0] = "postgres";
        args[1] = "--forkcol";
        args[2] = NULL;                 /* filled in by postmaster_forkexec */
        nargs = 3;

        /* terminate the vector and make sure it still fits */
        args[nargs] = NULL;
        Assert(nargs < lengthof(args));

        return postmaster_forkexec(nargs, args);
}
#endif   /* EXEC_BACKEND */
566
567
568 /*
569  * pgstat_start() -
570  *
571  *      Called from postmaster at startup or after an existing collector
572  *      died.  Attempt to fire up a fresh statistics collector.
573  *
574  *      Returns PID of child process, or 0 if fail.
575  *
576  *      Note: if fail, we will be called again from the postmaster main loop.
577  */
578 int
579 pgstat_start(void)
580 {
581         time_t          curtime;
582         pid_t           pgStatPid;
583
584         /*
585          * Check that the socket is there, else pgstat_init failed and we can do
586          * nothing useful.
587          */
588         if (pgStatSock < 0)
589                 return 0;
590
591         /*
592          * Do nothing if too soon since last collector start.  This is a safety
593          * valve to protect against continuous respawn attempts if the collector
594          * is dying immediately at launch.      Note that since we will be re-called
595          * from the postmaster main loop, we will get another chance later.
596          */
597         curtime = time(NULL);
598         if ((unsigned int) (curtime - last_pgstat_start_time) <
599                 (unsigned int) PGSTAT_RESTART_INTERVAL)
600                 return 0;
601         last_pgstat_start_time = curtime;
602
603         /*
604          * Okay, fork off the collector.
605          */
606 #ifdef EXEC_BACKEND
607         switch ((pgStatPid = pgstat_forkexec()))
608 #else
609         switch ((pgStatPid = fork_process()))
610 #endif
611         {
612                 case -1:
613                         ereport(LOG,
614                                         (errmsg("could not fork statistics collector: %m")));
615                         return 0;
616
617 #ifndef EXEC_BACKEND
618                 case 0:
619                         /* in postmaster child ... */
620                         /* Close the postmaster's sockets */
621                         ClosePostmasterPorts(false);
622
623                         /* Lose the postmaster's on-exit routines */
624                         on_exit_reset();
625
626                         /* Drop our connection to postmaster's shared memory, as well */
627                         PGSharedMemoryDetach();
628
629                         PgstatCollectorMain(0, NULL);
630                         break;
631 #endif
632
633                 default:
634                         return (int) pgStatPid;
635         }
636
637         /* shouldn't get here */
638         return 0;
639 }
640
641 void
642 allow_immediate_pgstat_restart(void)
643 {
644         last_pgstat_start_time = 0;
645 }
646
647 /* ------------------------------------------------------------
648  * Public functions used by backends follow
649  *------------------------------------------------------------
650  */
651
652
653 /* ----------
654  * pgstat_report_stat() -
655  *
656  *      Called from tcop/postgres.c to send the so far collected per-table
657  *      and function usage statistics to the collector.  Note that this is
658  *      called only when not within a transaction, so it is fair to use
659  *      transaction stop time as an approximation of current time.
660  * ----------
661  */
662 void
663 pgstat_report_stat(bool force)
664 {
665         /* we assume this inits to all zeroes: */
666         static const PgStat_TableCounts all_zeroes;
667         static TimestampTz last_report = 0;
668
669         TimestampTz now;
670         PgStat_MsgTabstat regular_msg;
671         PgStat_MsgTabstat shared_msg;
672         TabStatusArray *tsa;
673         int                     i;
674
675         /* Don't expend a clock check if nothing to do */
676         if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0)
677                 && !have_function_stats)
678                 return;
679
680         /*
681          * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
682          * msec since we last sent one, or the caller wants to force stats out.
683          */
684         now = GetCurrentTransactionStopTimestamp();
685         if (!force &&
686                 !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
687                 return;
688         last_report = now;
689
690         /*
691          * Scan through the TabStatusArray struct(s) to find tables that actually
692          * have counts, and build messages to send.  We have to separate shared
693          * relations from regular ones because the databaseid field in the message
694          * header has to depend on that.
695          */
696         regular_msg.m_databaseid = MyDatabaseId;
697         shared_msg.m_databaseid = InvalidOid;
698         regular_msg.m_nentries = 0;
699         shared_msg.m_nentries = 0;
700
701         for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
702         {
703                 for (i = 0; i < tsa->tsa_used; i++)
704                 {
705                         PgStat_TableStatus *entry = &tsa->tsa_entries[i];
706                         PgStat_MsgTabstat *this_msg;
707                         PgStat_TableEntry *this_ent;
708
709                         /* Shouldn't have any pending transaction-dependent counts */
710                         Assert(entry->trans == NULL);
711
712                         /*
713                          * Ignore entries that didn't accumulate any actual counts, such
714                          * as indexes that were opened by the planner but not used.
715                          */
716                         if (memcmp(&entry->t_counts, &all_zeroes,
717                                            sizeof(PgStat_TableCounts)) == 0)
718                                 continue;
719
720                         /*
721                          * OK, insert data into the appropriate message, and send if full.
722                          */
723                         this_msg = entry->t_shared ? &shared_msg : &regular_msg;
724                         this_ent = &this_msg->m_entry[this_msg->m_nentries];
725                         this_ent->t_id = entry->t_id;
726                         memcpy(&this_ent->t_counts, &entry->t_counts,
727                                    sizeof(PgStat_TableCounts));
728                         if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
729                         {
730                                 pgstat_send_tabstat(this_msg);
731                                 this_msg->m_nentries = 0;
732                         }
733                 }
734                 /* zero out TableStatus structs after use */
735                 MemSet(tsa->tsa_entries, 0,
736                            tsa->tsa_used * sizeof(PgStat_TableStatus));
737                 tsa->tsa_used = 0;
738         }
739
740         /*
741          * Send partial messages.  If force is true, make sure that any pending
742          * xact commit/abort gets counted, even if no table stats to send.
743          */
744         if (regular_msg.m_nentries > 0 ||
745                 (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
746                 pgstat_send_tabstat(&regular_msg);
747         if (shared_msg.m_nentries > 0)
748                 pgstat_send_tabstat(&shared_msg);
749
750         /* Now, send function statistics */
751         pgstat_send_funcstats();
752 }
753
754 /*
755  * Subroutine for pgstat_report_stat: finish and send a tabstat message
756  */
757 static void
758 pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
759 {
760         int                     n;
761         int                     len;
762
763         /* It's unlikely we'd get here with no socket, but maybe not impossible */
764         if (pgStatSock < 0)
765                 return;
766
767         /*
768          * Report accumulated xact commit/rollback whenever we send a normal
769          * tabstat message
770          */
771         if (OidIsValid(tsmsg->m_databaseid))
772         {
773                 tsmsg->m_xact_commit = pgStatXactCommit;
774                 tsmsg->m_xact_rollback = pgStatXactRollback;
775                 pgStatXactCommit = 0;
776                 pgStatXactRollback = 0;
777         }
778         else
779         {
780                 tsmsg->m_xact_commit = 0;
781                 tsmsg->m_xact_rollback = 0;
782         }
783
784         n = tsmsg->m_nentries;
785         len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
786                 n * sizeof(PgStat_TableEntry);
787
788         pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
789         pgstat_send(tsmsg, len);
790 }
791
792 /*
793  * Subroutine for pgstat_report_stat: populate and send a function stat message
794  */
795 static void
796 pgstat_send_funcstats(void)
797 {
798         /* we assume this inits to all zeroes: */
799         static const PgStat_FunctionCounts all_zeroes;
800
801         PgStat_MsgFuncstat msg;
802         PgStat_BackendFunctionEntry *entry;
803         HASH_SEQ_STATUS fstat;
804
805         if (pgStatFunctions == NULL)
806                 return;
807
808         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
809         msg.m_databaseid = MyDatabaseId;
810         msg.m_nentries = 0;
811
812         hash_seq_init(&fstat, pgStatFunctions);
813         while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
814         {
815                 PgStat_FunctionEntry *m_ent;
816
817                 /* Skip it if no counts accumulated since last time */
818                 if (memcmp(&entry->f_counts, &all_zeroes,
819                                    sizeof(PgStat_FunctionCounts)) == 0)
820                         continue;
821
822                 /* need to convert format of time accumulators */
823                 m_ent = &msg.m_entry[msg.m_nentries];
824                 m_ent->f_id = entry->f_id;
825                 m_ent->f_numcalls = entry->f_counts.f_numcalls;
826                 m_ent->f_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time);
827                 m_ent->f_time_self = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time_self);
828
829                 if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
830                 {
831                         pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
832                                                 msg.m_nentries * sizeof(PgStat_FunctionEntry));
833                         msg.m_nentries = 0;
834                 }
835
836                 /* reset the entry's counts */
837                 MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
838         }
839
840         if (msg.m_nentries > 0)
841                 pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
842                                         msg.m_nentries * sizeof(PgStat_FunctionEntry));
843
844         have_function_stats = false;
845 }
846
847
848 /* ----------
849  * pgstat_vacuum_stat() -
850  *
851  *      Will tell the collector about objects he can get rid of.
852  * ----------
853  */
854 void
855 pgstat_vacuum_stat(void)
856 {
857         HTAB       *htab;
858         PgStat_MsgTabpurge msg;
859         PgStat_MsgFuncpurge f_msg;
860         HASH_SEQ_STATUS hstat;
861         PgStat_StatDBEntry *dbentry;
862         PgStat_StatTabEntry *tabentry;
863         PgStat_StatFuncEntry *funcentry;
864         int                     len;
865
866         if (pgStatSock < 0)
867                 return;
868
869         /*
870          * If not done for this transaction, read the statistics collector stats
871          * file into some hash tables.
872          */
873         backend_read_statsfile();
874
875         /*
876          * Read pg_database and make a list of OIDs of all existing databases
877          */
878         htab = pgstat_collect_oids(DatabaseRelationId);
879
880         /*
881          * Search the database hash table for dead databases and tell the
882          * collector to drop them.
883          */
884         hash_seq_init(&hstat, pgStatDBHash);
885         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
886         {
887                 Oid                     dbid = dbentry->databaseid;
888
889                 CHECK_FOR_INTERRUPTS();
890
891                 /* the DB entry for shared tables (with InvalidOid) is never dropped */
892                 if (OidIsValid(dbid) &&
893                         hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
894                         pgstat_drop_database(dbid);
895         }
896
897         /* Clean up */
898         hash_destroy(htab);
899
900         /*
901          * Lookup our own database entry; if not found, nothing more to do.
902          */
903         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
904                                                                                                  (void *) &MyDatabaseId,
905                                                                                                  HASH_FIND, NULL);
906         if (dbentry == NULL || dbentry->tables == NULL)
907                 return;
908
909         /*
910          * Similarly to above, make a list of all known relations in this DB.
911          */
912         htab = pgstat_collect_oids(RelationRelationId);
913
914         /*
915          * Initialize our messages table counter to zero
916          */
917         msg.m_nentries = 0;
918
919         /*
920          * Check for all tables listed in stats hashtable if they still exist.
921          */
922         hash_seq_init(&hstat, dbentry->tables);
923         while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
924         {
925                 Oid                     tabid = tabentry->tableid;
926
927                 CHECK_FOR_INTERRUPTS();
928
929                 if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
930                         continue;
931
932                 /*
933                  * Not there, so add this table's Oid to the message
934                  */
935                 msg.m_tableid[msg.m_nentries++] = tabid;
936
937                 /*
938                  * If the message is full, send it out and reinitialize to empty
939                  */
940                 if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
941                 {
942                         len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
943                                 +msg.m_nentries * sizeof(Oid);
944
945                         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
946                         msg.m_databaseid = MyDatabaseId;
947                         pgstat_send(&msg, len);
948
949                         msg.m_nentries = 0;
950                 }
951         }
952
953         /*
954          * Send the rest
955          */
956         if (msg.m_nentries > 0)
957         {
958                 len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
959                         +msg.m_nentries * sizeof(Oid);
960
961                 pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
962                 msg.m_databaseid = MyDatabaseId;
963                 pgstat_send(&msg, len);
964         }
965
966         /* Clean up */
967         hash_destroy(htab);
968
969         /*
970          * Now repeat the above steps for functions.  However, we needn't bother
971          * in the common case where no function stats are being collected.
972          */
973         if (dbentry->functions != NULL &&
974                 hash_get_num_entries(dbentry->functions) > 0)
975         {
976                 htab = pgstat_collect_oids(ProcedureRelationId);
977
978                 pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
979                 f_msg.m_databaseid = MyDatabaseId;
980                 f_msg.m_nentries = 0;
981
982                 hash_seq_init(&hstat, dbentry->functions);
983                 while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
984                 {
985                         Oid                     funcid = funcentry->functionid;
986
987                         CHECK_FOR_INTERRUPTS();
988
989                         if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
990                                 continue;
991
992                         /*
993                          * Not there, so add this function's Oid to the message
994                          */
995                         f_msg.m_functionid[f_msg.m_nentries++] = funcid;
996
997                         /*
998                          * If the message is full, send it out and reinitialize to empty
999                          */
1000                         if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
1001                         {
1002                                 len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1003                                         +f_msg.m_nentries * sizeof(Oid);
1004
1005                                 pgstat_send(&f_msg, len);
1006
1007                                 f_msg.m_nentries = 0;
1008                         }
1009                 }
1010
1011                 /*
1012                  * Send the rest
1013                  */
1014                 if (f_msg.m_nentries > 0)
1015                 {
1016                         len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1017                                 +f_msg.m_nentries * sizeof(Oid);
1018
1019                         pgstat_send(&f_msg, len);
1020                 }
1021
1022                 hash_destroy(htab);
1023         }
1024 }
1025
1026
1027 /* ----------
1028  * pgstat_collect_oids() -
1029  *
1030  *      Collect the OIDs of all objects listed in the specified system catalog
1031  *      into a temporary hash table.  Caller should hash_destroy the result
1032  *      when done with it.
1033  * ----------
1034  */
1035 static HTAB *
1036 pgstat_collect_oids(Oid catalogid)
1037 {
1038         HTAB       *htab;
1039         HASHCTL         hash_ctl;
1040         Relation        rel;
1041         HeapScanDesc scan;
1042         HeapTuple       tup;
1043
1044         memset(&hash_ctl, 0, sizeof(hash_ctl));
1045         hash_ctl.keysize = sizeof(Oid);
1046         hash_ctl.entrysize = sizeof(Oid);
1047         hash_ctl.hash = oid_hash;
1048         htab = hash_create("Temporary table of OIDs",
1049                                            PGSTAT_TAB_HASH_SIZE,
1050                                            &hash_ctl,
1051                                            HASH_ELEM | HASH_FUNCTION);
1052
1053         rel = heap_open(catalogid, AccessShareLock);
1054         scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
1055         while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
1056         {
1057                 Oid                     thisoid = HeapTupleGetOid(tup);
1058
1059                 CHECK_FOR_INTERRUPTS();
1060
1061                 (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
1062         }
1063         heap_endscan(scan);
1064         heap_close(rel, AccessShareLock);
1065
1066         return htab;
1067 }
1068
1069
1070 /* ----------
1071  * pgstat_drop_database() -
1072  *
1073  *      Tell the collector that we just dropped a database.
1074  *      (If the message gets lost, we will still clean the dead DB eventually
1075  *      via future invocations of pgstat_vacuum_stat().)
1076  * ----------
1077  */
1078 void
1079 pgstat_drop_database(Oid databaseid)
1080 {
1081         PgStat_MsgDropdb msg;
1082
1083         if (pgStatSock < 0)
1084                 return;
1085
1086         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
1087         msg.m_databaseid = databaseid;
1088         pgstat_send(&msg, sizeof(msg));
1089 }
1090
1091
/* ----------
 * pgstat_drop_relation() -
 *
 *	Notify the collector that a relation has just been dropped, by sending
 *	a table-purge message holding that single OID.
 *	(Should the message be lost, a later pgstat_vacuum_stat() run will
 *	still clean out the dead entry eventually.)
 *
 *	Compiled out at present for lack of any good place to call it; cleanup
 *	of stats for dead rels relies entirely on pgstat_vacuum_stat().
 * ----------
 */
#ifdef NOT_USED
void
pgstat_drop_relation(Oid relid)
{
	PgStat_MsgTabpurge purge;
	int			msglen;

	if (pgStatSock < 0)
		return;

	pgstat_setheader(&purge.m_hdr, PGSTAT_MTYPE_TABPURGE);
	purge.m_databaseid = MyDatabaseId;
	purge.m_nentries = 1;
	purge.m_tableid[0] = relid;

	/* fixed part of the message plus exactly one OID */
	msglen = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
	pgstat_send(&purge, msglen);
}
#endif   /* NOT_USED */
1123
1124
1125 /* ----------
1126  * pgstat_reset_counters() -
1127  *
1128  *      Tell the statistics collector to reset counters for our database.
1129  * ----------
1130  */
1131 void
1132 pgstat_reset_counters(void)
1133 {
1134         PgStat_MsgResetcounter msg;
1135
1136         if (pgStatSock < 0)
1137                 return;
1138
1139         if (!superuser())
1140                 ereport(ERROR,
1141                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
1142                                  errmsg("must be superuser to reset statistics counters")));
1143
1144         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
1145         msg.m_databaseid = MyDatabaseId;
1146         pgstat_send(&msg, sizeof(msg));
1147 }
1148
1149
1150 /* ----------
1151  * pgstat_report_autovac() -
1152  *
1153  *      Called from autovacuum.c to report startup of an autovacuum process.
1154  *      We are called before InitPostgres is done, so can't rely on MyDatabaseId;
1155  *      the db OID must be passed in, instead.
1156  * ----------
1157  */
1158 void
1159 pgstat_report_autovac(Oid dboid)
1160 {
1161         PgStat_MsgAutovacStart msg;
1162
1163         if (pgStatSock < 0)
1164                 return;
1165
1166         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
1167         msg.m_databaseid = dboid;
1168         msg.m_start_time = GetCurrentTimestamp();
1169
1170         pgstat_send(&msg, sizeof(msg));
1171 }
1172
1173
1174 /* ---------
1175  * pgstat_report_vacuum() -
1176  *
1177  *      Tell the collector about the table we just vacuumed.
1178  * ---------
1179  */
1180 void
1181 pgstat_report_vacuum(Oid tableoid, bool shared, bool scanned_all,
1182                                          bool analyze, PgStat_Counter tuples)
1183 {
1184         PgStat_MsgVacuum msg;
1185
1186         if (pgStatSock < 0 || !pgstat_track_counts)
1187                 return;
1188
1189         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
1190         msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
1191         msg.m_tableoid = tableoid;
1192         msg.m_scanned_all = scanned_all;
1193         msg.m_analyze = analyze;
1194         msg.m_autovacuum = IsAutoVacuumWorkerProcess();         /* is this autovacuum? */
1195         msg.m_vacuumtime = GetCurrentTimestamp();
1196         msg.m_tuples = tuples;
1197         pgstat_send(&msg, sizeof(msg));
1198 }
1199
1200 /* --------
1201  * pgstat_report_analyze() -
1202  *
1203  *      Tell the collector about the table we just analyzed.
1204  * --------
1205  */
1206 void
1207 pgstat_report_analyze(Relation rel, PgStat_Counter livetuples,
1208                                           PgStat_Counter deadtuples)
1209 {
1210         PgStat_MsgAnalyze msg;
1211
1212         if (pgStatSock < 0 || !pgstat_track_counts)
1213                 return;
1214
1215         /*
1216          * Unlike VACUUM, ANALYZE might be running inside a transaction that
1217          * has already inserted and/or deleted rows in the target table.
1218          * ANALYZE will have counted such rows as live or dead respectively.
1219          * Because we will report our counts of such rows at transaction end,
1220          * we should subtract off these counts from what we send to the collector
1221          * now, else they'll be double-counted after commit.  (This approach also
1222          * ensures that the collector ends up with the right numbers if we abort
1223          * instead of committing.)
1224          */
1225         if (rel->pgstat_info != NULL)
1226         {
1227                 PgStat_TableXactStatus *trans;
1228
1229                 for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
1230                 {
1231                         livetuples -= trans->tuples_inserted - trans->tuples_deleted;
1232                         deadtuples -= trans->tuples_deleted;
1233                 }
1234                 /* count stuff inserted by already-aborted subxacts, too */
1235                 deadtuples -= rel->pgstat_info->t_counts.t_new_dead_tuples;
1236                 /* Since ANALYZE's counts are estimates, we could have underflowed */
1237                 livetuples = Max(livetuples, 0);
1238                 deadtuples = Max(deadtuples, 0);
1239         }
1240
1241         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
1242         msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
1243         msg.m_tableoid = RelationGetRelid(rel);
1244         msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
1245         msg.m_analyzetime = GetCurrentTimestamp();
1246         msg.m_live_tuples = livetuples;
1247         msg.m_dead_tuples = deadtuples;
1248         pgstat_send(&msg, sizeof(msg));
1249 }
1250
1251
1252 /* ----------
1253  * pgstat_ping() -
1254  *
1255  *      Send some junk data to the collector to increase traffic.
1256  * ----------
1257  */
1258 void
1259 pgstat_ping(void)
1260 {
1261         PgStat_MsgDummy msg;
1262
1263         if (pgStatSock < 0)
1264                 return;
1265
1266         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
1267         pgstat_send(&msg, sizeof(msg));
1268 }
1269
1270 /* ----------
1271  * pgstat_send_inquiry() -
1272  *
1273  *      Notify collector that we need fresh data.
1274  *      ts specifies the minimum acceptable timestamp for the stats file.
1275  * ----------
1276  */
1277 static void
1278 pgstat_send_inquiry(TimestampTz ts)
1279 {
1280         PgStat_MsgInquiry msg;
1281
1282         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
1283         msg.inquiry_time = ts;
1284         pgstat_send(&msg, sizeof(msg));
1285 }
1286
1287
1288 /*
1289  * Initialize function call usage data.
1290  * Called by the executor before invoking a function.
1291  */
1292 void
1293 pgstat_init_function_usage(FunctionCallInfoData *fcinfo,
1294                                                    PgStat_FunctionCallUsage *fcu)
1295 {
1296         PgStat_BackendFunctionEntry *htabent;
1297         bool            found;
1298
1299         if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
1300         {
1301                 /* stats not wanted */
1302                 fcu->fs = NULL;
1303                 return;
1304         }
1305
1306         if (!pgStatFunctions)
1307         {
1308                 /* First time through - initialize function stat table */
1309                 HASHCTL         hash_ctl;
1310
1311                 memset(&hash_ctl, 0, sizeof(hash_ctl));
1312                 hash_ctl.keysize = sizeof(Oid);
1313                 hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
1314                 hash_ctl.hash = oid_hash;
1315                 pgStatFunctions = hash_create("Function stat entries",
1316                                                                           PGSTAT_FUNCTION_HASH_SIZE,
1317                                                                           &hash_ctl,
1318                                                                           HASH_ELEM | HASH_FUNCTION);
1319         }
1320
1321         /* Get the stats entry for this function, create if necessary */
1322         htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid,
1323                                                   HASH_ENTER, &found);
1324         if (!found)
1325                 MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts));
1326
1327         fcu->fs = &htabent->f_counts;
1328
1329         /* save stats for this function, later used to compensate for recursion */
1330         fcu->save_f_time = htabent->f_counts.f_time;
1331
1332         /* save current backend-wide total time */
1333         fcu->save_total = total_func_time;
1334
1335         /* get clock time as of function start */
1336         INSTR_TIME_SET_CURRENT(fcu->f_start);
1337 }
1338
1339 /*
1340  * Calculate function call usage and update stat counters.
1341  * Called by the executor after invoking a function.
1342  *
1343  * In the case of a set-returning function that runs in value-per-call mode,
1344  * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
1345  * calls for what the user considers a single call of the function.  The
1346  * finalize flag should be TRUE on the last call.
1347  */
1348 void
1349 pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize)
1350 {
1351         PgStat_FunctionCounts *fs = fcu->fs;
1352         instr_time      f_total;
1353         instr_time      f_others;
1354         instr_time      f_self;
1355
1356         /* stats not wanted? */
1357         if (fs == NULL)
1358                 return;
1359
1360         /* total elapsed time in this function call */
1361         INSTR_TIME_SET_CURRENT(f_total);
1362         INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
1363
1364         /* self usage: elapsed minus anything already charged to other calls */
1365         f_others = total_func_time;
1366         INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
1367         f_self = f_total;
1368         INSTR_TIME_SUBTRACT(f_self, f_others);
1369
1370         /* update backend-wide total time */
1371         INSTR_TIME_ADD(total_func_time, f_self);
1372
1373         /*
1374          * Compute the new total f_time as the total elapsed time added to the
1375          * pre-call value of f_time.  This is necessary to avoid double-counting
1376          * any time taken by recursive calls of myself.  (We do not need any
1377          * similar kluge for self time, since that already excludes any
1378          * recursive calls.)
1379          */
1380         INSTR_TIME_ADD(f_total, fcu->save_f_time);
1381
1382         /* update counters in function stats table */
1383         if (finalize)
1384                 fs->f_numcalls++;
1385         fs->f_time = f_total;
1386         INSTR_TIME_ADD(fs->f_time_self, f_self);
1387
1388         /* indicate that we have something to send */
1389         have_function_stats = true;
1390 }
1391
1392
1393 /* ----------
1394  * pgstat_initstats() -
1395  *
1396  *      Initialize a relcache entry to count access statistics.
1397  *      Called whenever a relation is opened.
1398  *
1399  *      We assume that a relcache entry's pgstat_info field is zeroed by
1400  *      relcache.c when the relcache entry is made; thereafter it is long-lived
1401  *      data.  We can avoid repeated searches of the TabStatus arrays when the
1402  *      same relation is touched repeatedly within a transaction.
1403  * ----------
1404  */
1405 void
1406 pgstat_initstats(Relation rel)
1407 {
1408         Oid                     rel_id = rel->rd_id;
1409         char            relkind = rel->rd_rel->relkind;
1410
1411         /* We only count stats for things that have storage */
1412         if (!(relkind == RELKIND_RELATION ||
1413                   relkind == RELKIND_INDEX ||
1414                   relkind == RELKIND_TOASTVALUE))
1415         {
1416                 rel->pgstat_info = NULL;
1417                 return;
1418         }
1419
1420         if (pgStatSock < 0 || !pgstat_track_counts)
1421         {
1422                 /* We're not counting at all */
1423                 rel->pgstat_info = NULL;
1424                 return;
1425         }
1426
1427         /*
1428          * If we already set up this relation in the current transaction, nothing
1429          * to do.
1430          */
1431         if (rel->pgstat_info != NULL &&
1432                 rel->pgstat_info->t_id == rel_id)
1433                 return;
1434
1435         /* Else find or make the PgStat_TableStatus entry, and update link */
1436         rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
1437 }
1438
1439 /*
1440  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
1441  */
1442 static PgStat_TableStatus *
1443 get_tabstat_entry(Oid rel_id, bool isshared)
1444 {
1445         PgStat_TableStatus *entry;
1446         TabStatusArray *tsa;
1447         TabStatusArray *prev_tsa;
1448         int                     i;
1449
1450         /*
1451          * Search the already-used tabstat slots for this relation.
1452          */
1453         prev_tsa = NULL;
1454         for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
1455         {
1456                 for (i = 0; i < tsa->tsa_used; i++)
1457                 {
1458                         entry = &tsa->tsa_entries[i];
1459                         if (entry->t_id == rel_id)
1460                                 return entry;
1461                 }
1462
1463                 if (tsa->tsa_used < TABSTAT_QUANTUM)
1464                 {
1465                         /*
1466                          * It must not be present, but we found a free slot instead. Fine,
1467                          * let's use this one.  We assume the entry was already zeroed,
1468                          * either at creation or after last use.
1469                          */
1470                         entry = &tsa->tsa_entries[tsa->tsa_used++];
1471                         entry->t_id = rel_id;
1472                         entry->t_shared = isshared;
1473                         return entry;
1474                 }
1475         }
1476
1477         /*
1478          * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
1479          */
1480         tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
1481                                                                                                         sizeof(TabStatusArray));
1482         if (prev_tsa)
1483                 prev_tsa->tsa_next = tsa;
1484         else
1485                 pgStatTabList = tsa;
1486
1487         /*
1488          * Use the first entry of the new TabStatusArray.
1489          */
1490         entry = &tsa->tsa_entries[tsa->tsa_used++];
1491         entry->t_id = rel_id;
1492         entry->t_shared = isshared;
1493         return entry;
1494 }
1495
1496 /*
1497  * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
1498  */
1499 static PgStat_SubXactStatus *
1500 get_tabstat_stack_level(int nest_level)
1501 {
1502         PgStat_SubXactStatus *xact_state;
1503
1504         xact_state = pgStatXactStack;
1505         if (xact_state == NULL || xact_state->nest_level != nest_level)
1506         {
1507                 xact_state = (PgStat_SubXactStatus *)
1508                         MemoryContextAlloc(TopTransactionContext,
1509                                                            sizeof(PgStat_SubXactStatus));
1510                 xact_state->nest_level = nest_level;
1511                 xact_state->prev = pgStatXactStack;
1512                 xact_state->first = NULL;
1513                 pgStatXactStack = xact_state;
1514         }
1515         return xact_state;
1516 }
1517
1518 /*
1519  * add_tabstat_xact_level - add a new (sub)transaction state record
1520  */
1521 static void
1522 add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
1523 {
1524         PgStat_SubXactStatus *xact_state;
1525         PgStat_TableXactStatus *trans;
1526
1527         /*
1528          * If this is the first rel to be modified at the current nest level, we
1529          * first have to push a transaction stack entry.
1530          */
1531         xact_state = get_tabstat_stack_level(nest_level);
1532
1533         /* Now make a per-table stack entry */
1534         trans = (PgStat_TableXactStatus *)
1535                 MemoryContextAllocZero(TopTransactionContext,
1536                                                            sizeof(PgStat_TableXactStatus));
1537         trans->nest_level = nest_level;
1538         trans->upper = pgstat_info->trans;
1539         trans->parent = pgstat_info;
1540         trans->next = xact_state->first;
1541         xact_state->first = trans;
1542         pgstat_info->trans = trans;
1543 }
1544
1545 /*
1546  * pgstat_count_heap_insert - count a tuple insertion
1547  */
1548 void
1549 pgstat_count_heap_insert(Relation rel)
1550 {
1551         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1552
1553         if (pgstat_track_counts && pgstat_info != NULL)
1554         {
1555                 int                     nest_level = GetCurrentTransactionNestLevel();
1556
1557                 /* t_tuples_inserted is nontransactional, so just advance it */
1558                 pgstat_info->t_counts.t_tuples_inserted++;
1559
1560                 /* We have to log the transactional effect at the proper level */
1561                 if (pgstat_info->trans == NULL ||
1562                         pgstat_info->trans->nest_level != nest_level)
1563                         add_tabstat_xact_level(pgstat_info, nest_level);
1564
1565                 pgstat_info->trans->tuples_inserted++;
1566         }
1567 }
1568
1569 /*
1570  * pgstat_count_heap_update - count a tuple update
1571  */
1572 void
1573 pgstat_count_heap_update(Relation rel, bool hot)
1574 {
1575         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1576
1577         if (pgstat_track_counts && pgstat_info != NULL)
1578         {
1579                 int                     nest_level = GetCurrentTransactionNestLevel();
1580
1581                 /* t_tuples_updated is nontransactional, so just advance it */
1582                 pgstat_info->t_counts.t_tuples_updated++;
1583                 /* ditto for the hot_update counter */
1584                 if (hot)
1585                         pgstat_info->t_counts.t_tuples_hot_updated++;
1586
1587                 /* We have to log the transactional effect at the proper level */
1588                 if (pgstat_info->trans == NULL ||
1589                         pgstat_info->trans->nest_level != nest_level)
1590                         add_tabstat_xact_level(pgstat_info, nest_level);
1591
1592                 /* An UPDATE both inserts a new tuple and deletes the old */
1593                 pgstat_info->trans->tuples_inserted++;
1594                 pgstat_info->trans->tuples_deleted++;
1595         }
1596 }
1597
1598 /*
1599  * pgstat_count_heap_delete - count a tuple deletion
1600  */
1601 void
1602 pgstat_count_heap_delete(Relation rel)
1603 {
1604         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1605
1606         if (pgstat_track_counts && pgstat_info != NULL)
1607         {
1608                 int                     nest_level = GetCurrentTransactionNestLevel();
1609
1610                 /* t_tuples_deleted is nontransactional, so just advance it */
1611                 pgstat_info->t_counts.t_tuples_deleted++;
1612
1613                 /* We have to log the transactional effect at the proper level */
1614                 if (pgstat_info->trans == NULL ||
1615                         pgstat_info->trans->nest_level != nest_level)
1616                         add_tabstat_xact_level(pgstat_info, nest_level);
1617
1618                 pgstat_info->trans->tuples_deleted++;
1619         }
1620 }
1621
1622 /*
1623  * pgstat_update_heap_dead_tuples - update dead-tuples count
1624  *
1625  * The semantics of this are that we are reporting the nontransactional
1626  * recovery of "delta" dead tuples; so t_new_dead_tuples decreases
1627  * rather than increasing, and the change goes straight into the per-table
1628  * counter, not into transactional state.
1629  */
1630 void
1631 pgstat_update_heap_dead_tuples(Relation rel, int delta)
1632 {
1633         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1634
1635         if (pgstat_track_counts && pgstat_info != NULL)
1636                 pgstat_info->t_counts.t_new_dead_tuples -= delta;
1637 }
1638
1639
1640 /* ----------
1641  * AtEOXact_PgStat
1642  *
1643  *      Called from access/transam/xact.c at top-level transaction commit/abort.
1644  * ----------
1645  */
1646 void
1647 AtEOXact_PgStat(bool isCommit)
1648 {
1649         PgStat_SubXactStatus *xact_state;
1650
1651         /*
1652          * Count transaction commit or abort.  (We use counters, not just bools,
1653          * in case the reporting message isn't sent right away.)
1654          */
1655         if (isCommit)
1656                 pgStatXactCommit++;
1657         else
1658                 pgStatXactRollback++;
1659
1660         /*
1661          * Transfer transactional insert/update counts into the base tabstat
1662          * entries.  We don't bother to free any of the transactional state, since
1663          * it's all in TopTransactionContext and will go away anyway.
1664          */
1665         xact_state = pgStatXactStack;
1666         if (xact_state != NULL)
1667         {
1668                 PgStat_TableXactStatus *trans;
1669
1670                 Assert(xact_state->nest_level == 1);
1671                 Assert(xact_state->prev == NULL);
1672                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1673                 {
1674                         PgStat_TableStatus *tabstat;
1675
1676                         Assert(trans->nest_level == 1);
1677                         Assert(trans->upper == NULL);
1678                         tabstat = trans->parent;
1679                         Assert(tabstat->trans == trans);
1680                         if (isCommit)
1681                         {
1682                                 tabstat->t_counts.t_new_live_tuples +=
1683                                         trans->tuples_inserted - trans->tuples_deleted;
1684                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
1685                         }
1686                         else
1687                         {
1688                                 /* inserted tuples are dead, deleted tuples are unaffected */
1689                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
1690                         }
1691                         tabstat->trans = NULL;
1692                 }
1693         }
1694         pgStatXactStack = NULL;
1695
1696         /* Make sure any stats snapshot is thrown away */
1697         pgstat_clear_snapshot();
1698 }
1699
1700 /* ----------
1701  * AtEOSubXact_PgStat
1702  *
1703  *      Called from access/transam/xact.c at subtransaction commit/abort.
1704  * ----------
1705  */
1706 void
1707 AtEOSubXact_PgStat(bool isCommit, int nestDepth)
1708 {
1709         PgStat_SubXactStatus *xact_state;
1710
1711         /*
1712          * Transfer transactional insert/update counts into the next higher
1713          * subtransaction state.
1714          */
1715         xact_state = pgStatXactStack;
1716         if (xact_state != NULL &&
1717                 xact_state->nest_level >= nestDepth)
1718         {
1719                 PgStat_TableXactStatus *trans;
1720                 PgStat_TableXactStatus *next_trans;
1721
1722                 /* delink xact_state from stack immediately to simplify reuse case */
1723                 pgStatXactStack = xact_state->prev;
1724
1725                 for (trans = xact_state->first; trans != NULL; trans = next_trans)
1726                 {
1727                         PgStat_TableStatus *tabstat;
1728
1729                         next_trans = trans->next;
1730                         Assert(trans->nest_level == nestDepth);
1731                         tabstat = trans->parent;
1732                         Assert(tabstat->trans == trans);
1733                         if (isCommit)
1734                         {
1735                                 if (trans->upper && trans->upper->nest_level == nestDepth - 1)
1736                                 {
1737                                         trans->upper->tuples_inserted += trans->tuples_inserted;
1738                                         trans->upper->tuples_deleted += trans->tuples_deleted;
1739                                         tabstat->trans = trans->upper;
1740                                         pfree(trans);
1741                                 }
1742                                 else
1743                                 {
1744                                         /*
1745                                          * When there isn't an immediate parent state, we can just
1746                                          * reuse the record instead of going through a
1747                                          * palloc/pfree pushup (this works since it's all in
1748                                          * TopTransactionContext anyway).  We have to re-link it
1749                                          * into the parent level, though, and that might mean
1750                                          * pushing a new entry into the pgStatXactStack.
1751                                          */
1752                                         PgStat_SubXactStatus *upper_xact_state;
1753
1754                                         upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
1755                                         trans->next = upper_xact_state->first;
1756                                         upper_xact_state->first = trans;
1757                                         trans->nest_level = nestDepth - 1;
1758                                 }
1759                         }
1760                         else
1761                         {
1762                                 /*
1763                                  * On abort, inserted tuples are dead (and can be bounced out
1764                                  * to the top-level tabstat), deleted tuples are unaffected
1765                                  */
1766                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
1767                                 tabstat->trans = trans->upper;
1768                                 pfree(trans);
1769                         }
1770                 }
1771                 pfree(xact_state);
1772         }
1773 }
1774
1775
1776 /*
1777  * AtPrepare_PgStat
1778  *              Save the transactional stats state at 2PC transaction prepare.
1779  *
1780  * In this phase we just generate 2PC records for all the pending
1781  * transaction-dependent stats work.
1782  */
1783 void
1784 AtPrepare_PgStat(void)
1785 {
1786         PgStat_SubXactStatus *xact_state;
1787
1788         xact_state = pgStatXactStack;
1789         if (xact_state != NULL)
1790         {
1791                 PgStat_TableXactStatus *trans;
1792
1793                 Assert(xact_state->nest_level == 1);
1794                 Assert(xact_state->prev == NULL);
1795                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1796                 {
1797                         PgStat_TableStatus *tabstat;
1798                         TwoPhasePgStatRecord record;
1799
1800                         Assert(trans->nest_level == 1);
1801                         Assert(trans->upper == NULL);
1802                         tabstat = trans->parent;
1803                         Assert(tabstat->trans == trans);
1804
1805                         record.tuples_inserted = trans->tuples_inserted;
1806                         record.tuples_deleted = trans->tuples_deleted;
1807                         record.t_id = tabstat->t_id;
1808                         record.t_shared = tabstat->t_shared;
1809
1810                         RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
1811                                                                    &record, sizeof(TwoPhasePgStatRecord));
1812                 }
1813         }
1814 }
1815
1816 /*
1817  * PostPrepare_PgStat
1818  *              Clean up after successful PREPARE.
1819  *
1820  * All we need do here is unlink the transaction stats state from the
1821  * nontransactional state.      The nontransactional action counts will be
1822  * reported to the stats collector immediately, while the effects on live
1823  * and dead tuple counts are preserved in the 2PC state file.
1824  *
1825  * Note: AtEOXact_PgStat is not called during PREPARE.
1826  */
1827 void
1828 PostPrepare_PgStat(void)
1829 {
1830         PgStat_SubXactStatus *xact_state;
1831
1832         /*
1833          * We don't bother to free any of the transactional state, since it's all
1834          * in TopTransactionContext and will go away anyway.
1835          */
1836         xact_state = pgStatXactStack;
1837         if (xact_state != NULL)
1838         {
1839                 PgStat_TableXactStatus *trans;
1840
1841                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1842                 {
1843                         PgStat_TableStatus *tabstat;
1844
1845                         tabstat = trans->parent;
1846                         tabstat->trans = NULL;
1847                 }
1848         }
1849         pgStatXactStack = NULL;
1850
1851         /* Make sure any stats snapshot is thrown away */
1852         pgstat_clear_snapshot();
1853 }
1854
1855 /*
1856  * 2PC processing routine for COMMIT PREPARED case.
1857  *
1858  * Load the saved counts into our local pgstats state.
1859  */
1860 void
1861 pgstat_twophase_postcommit(TransactionId xid, uint16 info,
1862                                                    void *recdata, uint32 len)
1863 {
1864         TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
1865         PgStat_TableStatus *pgstat_info;
1866
1867         /* Find or create a tabstat entry for the rel */
1868         pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
1869
1870         pgstat_info->t_counts.t_new_live_tuples +=
1871                 rec->tuples_inserted - rec->tuples_deleted;
1872         pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
1873 }
1874
1875 /*
1876  * 2PC processing routine for ROLLBACK PREPARED case.
1877  *
1878  * Load the saved counts into our local pgstats state, but treat them
1879  * as aborted.
1880  */
1881 void
1882 pgstat_twophase_postabort(TransactionId xid, uint16 info,
1883                                                   void *recdata, uint32 len)
1884 {
1885         TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
1886         PgStat_TableStatus *pgstat_info;
1887
1888         /* Find or create a tabstat entry for the rel */
1889         pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
1890
1891         /* inserted tuples are dead, deleted tuples are no-ops */
1892         pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
1893 }
1894
1895
1896 /* ----------
1897  * pgstat_fetch_stat_dbentry() -
1898  *
1899  *      Support function for the SQL-callable pgstat* functions. Returns
1900  *      the collected statistics for one database or NULL. NULL doesn't mean
1901  *      that the database doesn't exist, it is just not yet known by the
1902  *      collector, so the caller is better off to report ZERO instead.
1903  * ----------
1904  */
1905 PgStat_StatDBEntry *
1906 pgstat_fetch_stat_dbentry(Oid dbid)
1907 {
1908         /*
1909          * If not done for this transaction, read the statistics collector stats
1910          * file into some hash tables.
1911          */
1912         backend_read_statsfile();
1913
1914         /*
1915          * Lookup the requested database; return NULL if not found
1916          */
1917         return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1918                                                                                           (void *) &dbid,
1919                                                                                           HASH_FIND, NULL);
1920 }
1921
1922
1923 /* ----------
1924  * pgstat_fetch_stat_tabentry() -
1925  *
1926  *      Support function for the SQL-callable pgstat* functions. Returns
1927  *      the collected statistics for one table or NULL. NULL doesn't mean
1928  *      that the table doesn't exist, it is just not yet known by the
1929  *      collector, so the caller is better off to report ZERO instead.
1930  * ----------
1931  */
1932 PgStat_StatTabEntry *
1933 pgstat_fetch_stat_tabentry(Oid relid)
1934 {
1935         Oid                     dbid;
1936         PgStat_StatDBEntry *dbentry;
1937         PgStat_StatTabEntry *tabentry;
1938
1939         /*
1940          * If not done for this transaction, read the statistics collector stats
1941          * file into some hash tables.
1942          */
1943         backend_read_statsfile();
1944
1945         /*
1946          * Lookup our database, then look in its table hash table.
1947          */
1948         dbid = MyDatabaseId;
1949         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1950                                                                                                  (void *) &dbid,
1951                                                                                                  HASH_FIND, NULL);
1952         if (dbentry != NULL && dbentry->tables != NULL)
1953         {
1954                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
1955                                                                                                            (void *) &relid,
1956                                                                                                            HASH_FIND, NULL);
1957                 if (tabentry)
1958                         return tabentry;
1959         }
1960
1961         /*
1962          * If we didn't find it, maybe it's a shared table.
1963          */
1964         dbid = InvalidOid;
1965         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1966                                                                                                  (void *) &dbid,
1967                                                                                                  HASH_FIND, NULL);
1968         if (dbentry != NULL && dbentry->tables != NULL)
1969         {
1970                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
1971                                                                                                            (void *) &relid,
1972                                                                                                            HASH_FIND, NULL);
1973                 if (tabentry)
1974                         return tabentry;
1975         }
1976
1977         return NULL;
1978 }
1979
1980
1981 /* ----------
1982  * pgstat_fetch_stat_funcentry() -
1983  *
1984  *      Support function for the SQL-callable pgstat* functions. Returns
1985  *      the collected statistics for one function or NULL.
1986  * ----------
1987  */
1988 PgStat_StatFuncEntry *
1989 pgstat_fetch_stat_funcentry(Oid func_id)
1990 {
1991         PgStat_StatDBEntry *dbentry;
1992         PgStat_StatFuncEntry *funcentry = NULL;
1993
1994         /* load the stats file if needed */
1995         backend_read_statsfile();
1996
1997         /* Lookup our database, then find the requested function.  */
1998         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1999         if (dbentry != NULL && dbentry->functions != NULL)
2000         {
2001                 funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
2002                                                                                                                  (void *) &func_id,
2003                                                                                                                  HASH_FIND, NULL);
2004         }
2005
2006         return funcentry;
2007 }
2008
2009
2010 /* ----------
2011  * pgstat_fetch_stat_beentry() -
2012  *
2013  *      Support function for the SQL-callable pgstat* functions. Returns
2014  *      our local copy of the current-activity entry for one backend.
2015  *
2016  *      NB: caller is responsible for a check if the user is permitted to see
2017  *      this info (especially the querystring).
2018  * ----------
2019  */
2020 PgBackendStatus *
2021 pgstat_fetch_stat_beentry(int beid)
2022 {
2023         pgstat_read_current_status();
2024
2025         if (beid < 1 || beid > localNumBackends)
2026                 return NULL;
2027
2028         return &localBackendStatusTable[beid - 1];
2029 }
2030
2031
2032 /* ----------
2033  * pgstat_fetch_stat_numbackends() -
2034  *
2035  *      Support function for the SQL-callable pgstat* functions. Returns
2036  *      the maximum current backend id.
2037  * ----------
2038  */
2039 int
2040 pgstat_fetch_stat_numbackends(void)
2041 {
2042         pgstat_read_current_status();
2043
2044         return localNumBackends;
2045 }
2046
2047 /*
2048  * ---------
2049  * pgstat_fetch_global() -
2050  *
2051  *      Support function for the SQL-callable pgstat* functions. Returns
2052  *      a pointer to the global statistics struct.
2053  * ---------
2054  */
2055 PgStat_GlobalStats *
2056 pgstat_fetch_global(void)
2057 {
2058         backend_read_statsfile();
2059
2060         return &globalStats;
2061 }
2062
2063
2064 /* ------------------------------------------------------------
2065  * Functions for management of the shared-memory PgBackendStatus array
2066  * ------------------------------------------------------------
2067  */
2068
2069 static PgBackendStatus *BackendStatusArray = NULL;
2070 static PgBackendStatus *MyBEEntry = NULL;
2071 static char                        *BackendActivityBuffer = NULL;
2072
2073
2074 /*
2075  * Report shared-memory space needed by CreateSharedBackendStatus.
2076  */
2077 Size
2078 BackendStatusShmemSize(void)
2079 {
2080         Size            size;
2081
2082         size = add_size(mul_size(sizeof(PgBackendStatus), MaxBackends),
2083                                         mul_size(pgstat_track_activity_query_size, MaxBackends));
2084         return size;
2085 }
2086
2087 /*
2088  * Initialize the shared status array and activity string buffer during
2089  * postmaster startup.
2090  */
2091 void
2092 CreateSharedBackendStatus(void)
2093 {
2094         Size            size;
2095         bool            found;
2096         int                     i;
2097         char       *buffer;
2098
2099         /* Create or attach to the shared array */
2100         size = mul_size(sizeof(PgBackendStatus), MaxBackends);
2101         BackendStatusArray = (PgBackendStatus *)
2102                 ShmemInitStruct("Backend Status Array", size, &found);
2103
2104         if (!found)
2105         {
2106                 /*
2107                  * We're the first - initialize.
2108                  */
2109                 MemSet(BackendStatusArray, 0, size);
2110         }
2111
2112         /* Create or attach to the shared activity buffer */
2113         size = mul_size(pgstat_track_activity_query_size, MaxBackends);
2114         BackendActivityBuffer = (char*)
2115                 ShmemInitStruct("Backend Activity Buffer", size, &found);
2116
2117         if (!found)
2118         {
2119                 MemSet(BackendActivityBuffer, 0, size);
2120
2121                 /* Initialize st_activity pointers. */
2122                 buffer = BackendActivityBuffer;
2123                 for (i = 0; i < MaxBackends; i++) {
2124                         BackendStatusArray[i].st_activity = buffer;
2125                         buffer += pgstat_track_activity_query_size;
2126                 }
2127         }
2128 }
2129
2130
2131 /* ----------
2132  * pgstat_initialize() -
2133  *
2134  *      Initialize pgstats state, and set up our on-proc-exit hook.
2135  *      Called from InitPostgres.  MyBackendId must be set,
2136  *      but we must not have started any transaction yet (since the
2137  *      exit hook must run after the last transaction exit).
2138  * ----------
2139  */
2140 void
2141 pgstat_initialize(void)
2142 {
2143         /* Initialize MyBEEntry */
2144         Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
2145         MyBEEntry = &BackendStatusArray[MyBackendId - 1];
2146
2147         /* Set up a process-exit hook to clean up */
2148         on_shmem_exit(pgstat_beshutdown_hook, 0);
2149 }
2150
2151 /* ----------
2152  * pgstat_bestart() -
2153  *
2154  *      Initialize this backend's entry in the PgBackendStatus array.
2155  *      Called from InitPostgres.  MyDatabaseId and session userid must be set
2156  *      (hence, this cannot be combined with pgstat_initialize).
2157  * ----------
2158  */
2159 void
2160 pgstat_bestart(void)
2161 {
2162         TimestampTz proc_start_timestamp;
2163         Oid                     userid;
2164         SockAddr        clientaddr;
2165         volatile PgBackendStatus *beentry;
2166
2167         /*
2168          * To minimize the time spent modifying the PgBackendStatus entry, fetch
2169          * all the needed data first.
2170          *
2171          * If we have a MyProcPort, use its session start time (for consistency,
2172          * and to save a kernel call).
2173          */
2174         if (MyProcPort)
2175                 proc_start_timestamp = MyProcPort->SessionStartTime;
2176         else
2177                 proc_start_timestamp = GetCurrentTimestamp();
2178         userid = GetSessionUserId();
2179
2180         /*
2181          * We may not have a MyProcPort (eg, if this is the autovacuum process).
2182          * If so, use all-zeroes client address, which is dealt with specially in
2183          * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
2184          */
2185         if (MyProcPort)
2186                 memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr));
2187         else
2188                 MemSet(&clientaddr, 0, sizeof(clientaddr));
2189
2190         /*
2191          * Initialize my status entry, following the protocol of bumping
2192          * st_changecount before and after; and make sure it's even afterwards. We
2193          * use a volatile pointer here to ensure the compiler doesn't try to get
2194          * cute.
2195          */
2196         beentry = MyBEEntry;
2197         do
2198         {
2199                 beentry->st_changecount++;
2200         } while ((beentry->st_changecount & 1) == 0);
2201
2202         beentry->st_procpid = MyProcPid;
2203         beentry->st_proc_start_timestamp = proc_start_timestamp;
2204         beentry->st_activity_start_timestamp = 0;
2205         beentry->st_xact_start_timestamp = 0;
2206         beentry->st_databaseid = MyDatabaseId;
2207         beentry->st_userid = userid;
2208         beentry->st_clientaddr = clientaddr;
2209         beentry->st_waiting = false;
2210         beentry->st_activity[0] = '\0';
2211         /* Also make sure the last byte in the string area is always 0 */
2212         beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0';
2213
2214         beentry->st_changecount++;
2215         Assert((beentry->st_changecount & 1) == 0);
2216 }
2217
2218 /*
2219  * Shut down a single backend's statistics reporting at process exit.
2220  *
2221  * Flush any remaining statistics counts out to the collector.
2222  * Without this, operations triggered during backend exit (such as
2223  * temp table deletions) won't be counted.
2224  *
2225  * Lastly, clear out our entry in the PgBackendStatus array.
2226  */
2227 static void
2228 pgstat_beshutdown_hook(int code, Datum arg)
2229 {
2230         volatile PgBackendStatus *beentry = MyBEEntry;
2231
2232         pgstat_report_stat(true);
2233
2234         /*
2235          * Clear my status entry, following the protocol of bumping st_changecount
2236          * before and after.  We use a volatile pointer here to ensure the
2237          * compiler doesn't try to get cute.
2238          */
2239         beentry->st_changecount++;
2240
2241         beentry->st_procpid = 0;        /* mark invalid */
2242
2243         beentry->st_changecount++;
2244         Assert((beentry->st_changecount & 1) == 0);
2245 }
2246
2247
2248 /* ----------
2249  * pgstat_report_activity() -
2250  *
2251  *      Called from tcop/postgres.c to report what the backend is actually doing
2252  *      (usually "<IDLE>" or the start of the query to be executed).
2253  * ----------
2254  */
2255 void
2256 pgstat_report_activity(const char *cmd_str)
2257 {
2258         volatile PgBackendStatus *beentry = MyBEEntry;
2259         TimestampTz start_timestamp;
2260         int                     len;
2261
2262         TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
2263
2264         if (!pgstat_track_activities || !beentry)
2265                 return;
2266
2267         /*
2268          * To minimize the time spent modifying the entry, fetch all the needed
2269          * data first.
2270          */
2271         start_timestamp = GetCurrentStatementStartTimestamp();
2272
2273         len = strlen(cmd_str);
2274         len = pg_mbcliplen(cmd_str, len, pgstat_track_activity_query_size - 1);
2275
2276         /*
2277          * Update my status entry, following the protocol of bumping
2278          * st_changecount before and after.  We use a volatile pointer here to
2279          * ensure the compiler doesn't try to get cute.
2280          */
2281         beentry->st_changecount++;
2282
2283         beentry->st_activity_start_timestamp = start_timestamp;
2284         memcpy((char *) beentry->st_activity, cmd_str, len);
2285         beentry->st_activity[len] = '\0';
2286
2287         beentry->st_changecount++;
2288         Assert((beentry->st_changecount & 1) == 0);
2289 }
2290
2291 /*
2292  * Report current transaction start timestamp as the specified value.
2293  * Zero means there is no active transaction.
2294  */
2295 void
2296 pgstat_report_xact_timestamp(TimestampTz tstamp)
2297 {
2298         volatile PgBackendStatus *beentry = MyBEEntry;
2299
2300         if (!pgstat_track_activities || !beentry)
2301                 return;
2302
2303         /*
2304          * Update my status entry, following the protocol of bumping
2305          * st_changecount before and after.  We use a volatile pointer here to
2306          * ensure the compiler doesn't try to get cute.
2307          */
2308         beentry->st_changecount++;
2309         beentry->st_xact_start_timestamp = tstamp;
2310         beentry->st_changecount++;
2311         Assert((beentry->st_changecount & 1) == 0);
2312 }
2313
2314 /* ----------
2315  * pgstat_report_waiting() -
2316  *
2317  *      Called from lock manager to report beginning or end of a lock wait.
2318  *
2319  * NB: this *must* be able to survive being called before MyBEEntry has been
2320  * initialized.
2321  * ----------
2322  */
2323 void
2324 pgstat_report_waiting(bool waiting)
2325 {
2326         volatile PgBackendStatus *beentry = MyBEEntry;
2327
2328         if (!pgstat_track_activities || !beentry)
2329                 return;
2330
2331         /*
2332          * Since this is a single-byte field in a struct that only this process
2333          * may modify, there seems no need to bother with the st_changecount
2334          * protocol.  The update must appear atomic in any case.
2335          */
2336         beentry->st_waiting = waiting;
2337 }
2338
2339
2340 /* ----------
2341  * pgstat_read_current_status() -
2342  *
2343  *      Copy the current contents of the PgBackendStatus array to local memory,
2344  *      if not already done in this transaction.
2345  * ----------
2346  */
2347 static void
2348 pgstat_read_current_status(void)
2349 {
2350         volatile PgBackendStatus *beentry;
2351         PgBackendStatus *localtable;
2352         PgBackendStatus *localentry;
2353         char                    *localactivity;
2354         int                     i;
2355
2356         Assert(!pgStatRunningInCollector);
2357         if (localBackendStatusTable)
2358                 return;                                 /* already done */
2359
2360         pgstat_setup_memcxt();
2361
2362         localtable = (PgBackendStatus *)
2363                 MemoryContextAlloc(pgStatLocalContext,
2364                                                    sizeof(PgBackendStatus) * MaxBackends);
2365         localactivity = (char *)
2366                 MemoryContextAlloc(pgStatLocalContext,
2367                                                    pgstat_track_activity_query_size * MaxBackends);
2368         localNumBackends = 0;
2369
2370         beentry = BackendStatusArray;
2371         localentry = localtable;
2372         for (i = 1; i <= MaxBackends; i++)
2373         {
2374                 /*
2375                  * Follow the protocol of retrying if st_changecount changes while we
2376                  * copy the entry, or if it's odd.  (The check for odd is needed to
2377                  * cover the case where we are able to completely copy the entry while
2378                  * the source backend is between increment steps.)      We use a volatile
2379                  * pointer here to ensure the compiler doesn't try to get cute.
2380                  */
2381                 for (;;)
2382                 {
2383                         int                     save_changecount = beentry->st_changecount;
2384
2385                         localentry->st_procpid = beentry->st_procpid;
2386                         if (localentry->st_procpid > 0)
2387                         {
2388                                 memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
2389                                 /*
2390                                  * strcpy is safe even if the string is modified concurrently,
2391                                  * because there's always a \0 at the end of the buffer.
2392                                  */
2393                                 strcpy(localactivity, (char *) beentry->st_activity);
2394                                 localentry->st_activity = localactivity;
2395                         }
2396
2397                         if (save_changecount == beentry->st_changecount &&
2398                                 (save_changecount & 1) == 0)
2399                                 break;
2400
2401                         /* Make sure we can break out of loop if stuck... */
2402                         CHECK_FOR_INTERRUPTS();
2403                 }
2404
2405                 beentry++;
2406                 /* Only valid entries get included into the local array */
2407                 if (localentry->st_procpid > 0)
2408                 {
2409                         localentry++;
2410                         localactivity += pgstat_track_activity_query_size;
2411                         localNumBackends++;
2412                 }
2413         }
2414
2415         /* Set the pointer only after completion of a valid table */
2416         localBackendStatusTable = localtable;
2417 }
2418
2419
2420 /* ----------
2421  * pgstat_get_backend_current_activity() -
2422  *
2423  *      Return a string representing the current activity of the backend with
2424  *      the specified PID.  This looks directly at the BackendStatusArray,
2425  *      and so will provide current information regardless of the age of our
2426  *      transaction's snapshot of the status array.
2427  *
2428  *      It is the caller's responsibility to invoke this only for backends whose
2429  *      state is expected to remain stable while the result is in use.  The
2430  *      only current use is in deadlock reporting, where we can expect that
2431  *      the target backend is blocked on a lock.  (There are corner cases
2432  *      where the target's wait could get aborted while we are looking at it,
2433  *      but the very worst consequence is to return a pointer to a string
2434  *      that's been changed, so we won't worry too much.)
2435  *      If checkUser is true, a non-superuser gets a fixed privilege message for
2436  *      other users' backends.  Results are constant strings or the entry's own
2437  *      st_activity (no copy); special cases match pg_stat_get_backend_activity.
2438  */
2439 const char *
2440 pgstat_get_backend_current_activity(int pid, bool checkUser)
2441 {
2442         PgBackendStatus *beentry;
2443         int                     i;
2444         /* Visit each of the MaxBackends entries of BackendStatusArray in turn */
2445         beentry = BackendStatusArray;
2446         for (i = 1; i <= MaxBackends; i++)
2447         {
2448                 /*
2449                  * Although we expect the target backend's entry to be stable, that
2450                  * doesn't imply that anyone else's is.  To avoid identifying the
2451                  * wrong backend, while we check for a match to the desired PID we
2452                  * must follow the protocol of retrying if st_changecount changes
2453                  * while we examine the entry, or if it's odd.  (This might be
2454                  * unnecessary, since fetching or storing an int is almost certainly
2455                  * atomic, but let's play it safe.)  We use a volatile pointer here
2456                  * to ensure the compiler doesn't try to get cute.
2457                  */
2458                 volatile PgBackendStatus *vbeentry = beentry;
2459                 bool    found;
2460
2461                 for (;;)
2462                 {
2463                         int                     save_changecount = vbeentry->st_changecount;
2464
2465                         found = (vbeentry->st_procpid == pid);
2466                         /* Accept the read only if the count is unchanged and even */
2467                         if (save_changecount == vbeentry->st_changecount &&
2468                                 (save_changecount & 1) == 0)
2469                                 break;
2470
2471                         /* Make sure we can break out of loop if stuck... */
2472                         CHECK_FOR_INTERRUPTS();
2473                 }
2474
2475                 if (found)
2476                 {
2477                         /* Now it is safe to use the non-volatile pointer */
2478                         if (checkUser && !superuser() && beentry->st_userid != GetUserId())
2479                                 return "<insufficient privilege>";
2480                         else if (*(beentry->st_activity) == '\0')
2481                                 return "<command string not enabled>";
2482                         else
2483                                 return beentry->st_activity;    /* the entry's buffer, not a copy */
2484                 }
2485
2486                 beentry++;
2487         }
2488
2489         /* If we get here, caller is in error ... */
2490         return "<backend information not available>";
2491 }
2492
2493
2494 /* ------------------------------------------------------------
2495  * Local support functions follow
2496  * ------------------------------------------------------------
2497  */
2498
2499
2500 /* ----------
2501  * pgstat_setheader() -
2502  *
2503  *              Store the message type mtype in the header of a statistics
2504  *              message.  No other header field is touched here.
2505  */
2506 static void
2507 pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
2508 {
2509         hdr->m_type = mtype;
2510 }
2511
2512
2513 /* ----------
2514  * pgstat_send() -
2515  *
2516  *              Send out one statistics message to the collector
2517  * ----------
2518  */
2519 static void
2520 pgstat_send(void *msg, int len)
2521 {
2522         int                     rc;
2523
2524         if (pgStatSock < 0)
2525                 return;
2526
2527         ((PgStat_MsgHdr *) msg)->m_size = len;
2528
2529         /* We'll retry after EINTR, but ignore all other failures */
2530         do
2531         {
2532                 rc = send(pgStatSock, msg, len, 0);
2533         } while (rc < 0 && errno == EINTR);
2534
2535 #ifdef USE_ASSERT_CHECKING
2536         /* In debug builds, log send failures ... */
2537         if (rc < 0)
2538                 elog(LOG, "could not send to statistics collector: %m");
2539 #endif
2540 }
2541
2542 /* ----------
2543  * pgstat_send_bgwriter() -
2544  *
2545  *              Pass the accumulated BgWriterStats to the collector and reset
2546  *              them.  Nothing is sent while the whole buffer is still zero.
2547  * ----------
2548  */
2549 void
2550 pgstat_send_bgwriter(void)
2551 {
2552         /* Never written to; relied upon to consist of zero bytes only */
2553         static const PgStat_MsgBgWriter blank_stats;
2554         bool            nothing_counted;
2555
2556         /*
2557          * We may be called although nothing at all has been counted.  An
2558          * entirely empty message is not worth sending to the collector.
2559          */
2560         nothing_counted = (memcmp(&BgWriterStats, &blank_stats,
2561                                                           sizeof(PgStat_MsgBgWriter)) == 0);
2562
2563         if (!nothing_counted)
2564         {
2565                 /* Fill in the header and ship the buffer as it stands */
2566                 pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
2567                 pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
2568
2569                 /* Wipe the buffer so that it can be re-used */
2570                 MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
2571         }
2572 }
2573
2574
2575 /* ----------
2576  * PgstatCollectorMain() -
2577  *
2578  *      Body of the statistics collector child process: set up signal handling,
2579  *      load the saved stats, then receive and dispatch messages from
2580  *      pgStatSock until told to quit or the postmaster is found dead.  After
2581  *      the loop it writes the permanent stats file and calls exit(0).
2582  *      The argc/argv parameters are valid only in EXEC_BACKEND case.
2583  */
2584 NON_EXEC_STATIC void
2585 PgstatCollectorMain(int argc, char *argv[])
2586 {
2587         int                     len;
2588         PgStat_Msg      msg;
2589
2590 #ifndef WIN32
2591 #ifdef HAVE_POLL
2592         struct pollfd input_fd;
2593 #else
2594         struct timeval sel_timeout;
2595         fd_set          rfds;
2596 #endif
2597 #endif
2598
2599         IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
2600
2601         MyProcPid = getpid();           /* reset MyProcPid */
2602
2603         MyStartTime = time(NULL);       /* record Start Time for logging */
2604
2605         /*
2606          * If possible, make this process a group leader, so that the postmaster
2607          * can signal any child processes too.  (pgstat probably never has any
2608          * child processes, but for consistency we make all postmaster child
2609          * processes do this.)
2610          */
2611 #ifdef HAVE_SETSID
2612         if (setsid() < 0)
2613                 elog(FATAL, "setsid() failed: %m");
2614 #endif
2615
2616         /*
2617          * Ignore all signals usually bound to some action in the postmaster,
2618          * except SIGHUP (reload config) and SIGQUIT (leave the main loop).
2619          */
2620         pqsignal(SIGHUP, pgstat_sighup_handler);
2621         pqsignal(SIGINT, SIG_IGN);
2622         pqsignal(SIGTERM, SIG_IGN);
2623         pqsignal(SIGQUIT, pgstat_exit);
2624         pqsignal(SIGALRM, SIG_IGN);
2625         pqsignal(SIGPIPE, SIG_IGN);
2626         pqsignal(SIGUSR1, SIG_IGN);
2627         pqsignal(SIGUSR2, SIG_IGN);
2628         pqsignal(SIGCHLD, SIG_DFL);
2629         pqsignal(SIGTTIN, SIG_DFL);
2630         pqsignal(SIGTTOU, SIG_DFL);
2631         pqsignal(SIGCONT, SIG_DFL);
2632         pqsignal(SIGWINCH, SIG_DFL);
2633         PG_SETMASK(&UnBlockSig);
2634
2635         /*
2636          * Identify myself via ps
2637          */
2638         init_ps_display("stats collector process", "", "", "");
2639
2640         /*
2641          * Arrange to write the initial status file right away
2642          */
2643         last_statrequest = GetCurrentTimestamp();
2644         last_statwrite = last_statrequest - 1; /* so the loop's write < request test holds */
2645
2646         /*
2647          * Read in an existing permanent statistics file or initialize the stats
2648          * to zero.
2649          */
2650         pgStatRunningInCollector = true;
2651         pgStatDBHash = pgstat_read_statsfile(InvalidOid, true);
2652
2653         /*
2654          * Set up the descriptor set for select(2).  Since only one bit in the set
2655          * ever changes, we need not repeat FD_ZERO each time.
2656          */
2657 #if !defined(HAVE_POLL) && !defined(WIN32)
2658         FD_ZERO(&rfds);
2659 #endif
2660
2661         /*
2662          * Loop to process messages until we get SIGQUIT or detect ungraceful
2663          * death of our parent postmaster.
2664          *
2665          * For performance reasons, we don't want to do a PostmasterIsAlive() test
2666          * after every message; instead, do it only when select()/poll() is
2667          * interrupted by timeout.  In essence, we'll stay alive as long as
2668          * backends keep sending us stuff often, even if the postmaster is gone.
2669          */
2670         for (;;)
2671         {
2672                 int                     got_data;
2673
2674                 /*
2675                  * Quit if we get SIGQUIT from the postmaster.
2676                  */
2677                 if (need_exit)
2678                         break;
2679
2680                 /*
2681                  * Reload configuration if we got SIGHUP from the postmaster.
2682                  */
2683                 if (got_SIGHUP)
2684                 {
2685                         ProcessConfigFile(PGC_SIGHUP);
2686                         got_SIGHUP = false;
2687                 }
2688
2689                 /*
2690                  * Write the stats file if a new request has arrived that is not
2691                  * satisfied by existing file.
2692                  */
2693                 if (last_statwrite < last_statrequest)
2694                         pgstat_write_statsfile(false);
2695
2696                 /*
2697                  * Wait for a message to arrive; but not for more than
2698                  * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
2699                  * shut down after an ungraceful postmaster termination; so it needn't
2700                  * be very fast.  However, on some systems SIGQUIT won't interrupt the
2701                  * poll/select call, so this also limits speed of response to SIGQUIT,
2702                  * which is more important.)
2703                  *
2704                  * We use poll(2) if available, otherwise select(2). Win32 has its own
2705                  * implementation.
2706                  */
2707 #ifndef WIN32
2708 #ifdef HAVE_POLL
2709                 input_fd.fd = pgStatSock;
2710                 input_fd.events = POLLIN | POLLERR;
2711                 input_fd.revents = 0;
2712
2713                 if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
2714                 {
2715                         if (errno == EINTR)
2716                                 continue;
2717                         ereport(ERROR,
2718                                         (errcode_for_socket_access(),
2719                                          errmsg("poll() failed in statistics collector: %m")));
2720                 }
2721
2722                 got_data = (input_fd.revents != 0);
2723 #else                                                   /* !HAVE_POLL */
2724
2725                 FD_SET(pgStatSock, &rfds);
2726
2727                 /*
2728                  * timeout struct is modified by select() on some operating systems,
2729                  * so re-fill it each time.
2730                  */
2731                 sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
2732                 sel_timeout.tv_usec = 0;
2733
2734                 if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
2735                 {
2736                         if (errno == EINTR)
2737                                 continue;
2738                         ereport(ERROR,
2739                                         (errcode_for_socket_access(),
2740                                          errmsg("select() failed in statistics collector: %m")));
2741                 }
2742
2743                 got_data = FD_ISSET(pgStatSock, &rfds);
2744 #endif   /* HAVE_POLL */
2745 #else                                                   /* WIN32 */
2746                 got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
2747                                                                                            PGSTAT_SELECT_TIMEOUT * 1000);
2748 #endif
2749
2750                 /*
2751                  * If there is a message on the socket, read it and check for
2752                  * validity.
2753                  */
2754                 if (got_data)
2755                 {
2756                         len = recv(pgStatSock, (char *) &msg,
2757                                            sizeof(PgStat_Msg), 0);
2758                         if (len < 0)
2759                         {
2760                                 if (errno == EINTR)
2761                                         continue;
2762                                 ereport(ERROR,
2763                                                 (errcode_for_socket_access(),
2764                                                  errmsg("could not read statistics message: %m")));
2765                         }
2766
2767                         /*
2768                          * We ignore messages that are smaller than our common header
2769                          */
2770                         if (len < sizeof(PgStat_MsgHdr))
2771                                 continue;
2772
2773                         /*
2774                          * The received length must match the length in the header
2775                          */
2776                         if (msg.msg_hdr.m_size != len)
2777                                 continue;
2778
2779                         /*
2780                          * O.K. - we accept this message.  Process it.
2781                          */
2782                         switch (msg.msg_hdr.m_type)
2783                         {
2784                                 case PGSTAT_MTYPE_DUMMY:
2785                                         break;
2786
2787                                 case PGSTAT_MTYPE_INQUIRY:
2788                                         pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len);
2789                                         break;
2790
2791                                 case PGSTAT_MTYPE_TABSTAT:
2792                                         pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
2793                                         break;
2794
2795                                 case PGSTAT_MTYPE_TABPURGE:
2796                                         pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
2797                                         break;
2798
2799                                 case PGSTAT_MTYPE_DROPDB:
2800                                         pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
2801                                         break;
2802
2803                                 case PGSTAT_MTYPE_RESETCOUNTER:
2804                                         pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
2805                                                                                          len);
2806                                         break;
2807
2808                                 case PGSTAT_MTYPE_AUTOVAC_START:
2809                                         pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
2810                                         break;
2811
2812                                 case PGSTAT_MTYPE_VACUUM:
2813                                         pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
2814                                         break;
2815
2816                                 case PGSTAT_MTYPE_ANALYZE:
2817                                         pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
2818                                         break;
2819
2820                                 case PGSTAT_MTYPE_BGWRITER:
2821                                         pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
2822                                         break;
2823
2824                                 case PGSTAT_MTYPE_FUNCSTAT:
2825                                         pgstat_recv_funcstat((PgStat_MsgFuncstat *) &msg, len);
2826                                         break;
2827
2828                                 case PGSTAT_MTYPE_FUNCPURGE:
2829                                         pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
2830                                         break;
2831
2832                                 default:
2833                                         break;          /* any other message type is dropped */
2834                         }
2835                 }
2836                 else
2837                 {
2838                         /*
2839                          * We can only get here if the select/poll timeout elapsed. Check
2840                          * for postmaster death.
2841                          */
2842                         if (!PostmasterIsAlive(true))
2843                                 break;
2844                 }
2845         }                                                       /* end of message-processing loop */
2846
2847         /*
2848          * Save the final stats to reuse at next startup.
2849          */
2850         pgstat_write_statsfile(true);
2851
2852         exit(0);
2853 }
2854
2855
2856 /* SIGQUIT handler for collector process: just sets need_exit for the main loop */
2857 static void
2858 pgstat_exit(SIGNAL_ARGS)
2859 {
2860         need_exit = true;
2861 }
2862
2863 /* SIGHUP handler for collector process: just sets got_SIGHUP for the main loop */
2864 static void
2865 pgstat_sighup_handler(SIGNAL_ARGS)
2866 {
2867         got_SIGHUP = true;
2868 }
2869
2870
2871 /*
2872  * Find the hash table entry for the given database in pgStatDBHash.
2873  *
2874  * An entry that already exists is returned untouched.  When none exists the
2875  * result is NULL, unless create is true: then a new entry is added, its
2876  * counters are zeroed and it gets its own per-table and per-function hashes.
2877  */
2878 static PgStat_StatDBEntry *
2879 pgstat_get_db_entry(Oid databaseid, bool create)
2880 {
2881         PgStat_StatDBEntry *dbent;
2882         HASHCTL         ctl;
2883         bool            existed;
2884
2885         /* Plain lookup, or lookup-and-insert when the caller asked for it */
2886         dbent = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2887                                                                                            &databaseid,
2888                                                                                            create ? HASH_ENTER : HASH_FIND,
2889                                                                                            &existed);
2890
2891         /* An entry that was already there needs no setup */
2892         if (existed)
2893                 return dbent;
2894         if (!create)
2895                 return NULL;
2896
2897         /* Fresh entry: clear both hash pointers first, then every counter */
2898         dbent->tables = NULL;
2899         dbent->functions = NULL;
2900
2901         dbent->n_xact_commit = 0;
2902         dbent->n_xact_rollback = 0;
2903         dbent->n_blocks_fetched = 0;
2904         dbent->n_blocks_hit = 0;
2905         dbent->n_tuples_returned = 0;
2906         dbent->n_tuples_fetched = 0;
2907         dbent->n_tuples_inserted = 0;
2908         dbent->n_tuples_updated = 0;
2909         dbent->n_tuples_deleted = 0;
2910         dbent->last_autovac_time = 0;
2911
2912         /* Hash for the per-table stats, with an Oid-sized key */
2913         memset(&ctl, 0, sizeof(ctl));
2914         ctl.entrysize = sizeof(PgStat_StatTabEntry);
2915         ctl.keysize = sizeof(Oid);
2916         ctl.hash = oid_hash;
2917         dbent->tables = hash_create("Per-database table", PGSTAT_TAB_HASH_SIZE,
2918                                                                 &ctl, HASH_ELEM | HASH_FUNCTION);
2919
2920         /* Hash for the per-function stats, set up the same way */
2921         ctl.entrysize = sizeof(PgStat_StatFuncEntry);
2922         ctl.keysize = sizeof(Oid);
2923         ctl.hash = oid_hash;
2924         dbent->functions = hash_create("Per-database function", PGSTAT_FUNCTION_HASH_SIZE,
2925                                                                    &ctl, HASH_ELEM | HASH_FUNCTION);
2926
2927         return dbent;
2928 }
2929
2930
2931 /* ----------
2932  * pgstat_write_statsfile() -
2933  *
2934  *      Tell the news: write the global stats and each database's table and
2935  *      function stats to a temp file, then rename it over the stats file.
2936  *      Open, write, close and rename failures are only logged at LOG level.
2937  *      If writing to the permanent file (happens when the collector is shutting
2938  *      down only), also unlink pgstat_stat_filename so that backends starting up
2939  *      under a new postmaster can't read old data before the collector is ready.
2940  */
2941 static void
2942 pgstat_write_statsfile(bool permanent)
2943 {
2944         HASH_SEQ_STATUS hstat;
2945         HASH_SEQ_STATUS tstat;
2946         HASH_SEQ_STATUS fstat;
2947         PgStat_StatDBEntry *dbentry;
2948         PgStat_StatTabEntry *tabentry;
2949         PgStat_StatFuncEntry *funcentry;
2950         FILE       *fpout;
2951         int32           format_id;
2952         const char *tmpfile = permanent?PGSTAT_STAT_PERMANENT_TMPFILE:pgstat_stat_tmpname;
2953         const char *statfile = permanent?PGSTAT_STAT_PERMANENT_FILENAME:pgstat_stat_filename;
2954
2955         /*
2956          * Open the statistics temp file to write out the current values.
2957          */
2958         fpout = AllocateFile(tmpfile, PG_BINARY_W);
2959         if (fpout == NULL)
2960         {
2961                 ereport(LOG,
2962                                 (errcode_for_file_access(),
2963                                  errmsg("could not open temporary statistics file \"%s\": %m",
2964                                                 tmpfile)));
2965                 return;
2966         }
2967
2968         /*
2969          * Set the timestamp of the stats file.
2970          */
2971         globalStats.stats_timestamp = GetCurrentTimestamp();
2972
2973         /*
2974          * Write the file header --- currently just a format ID.
2975          */
2976         format_id = PGSTAT_FILE_FORMAT_ID;
2977         fwrite(&format_id, sizeof(format_id), 1, fpout);
2978
2979         /*
2980          * Write global stats struct
2981          */
2982         fwrite(&globalStats, sizeof(globalStats), 1, fpout);
2983
2984         /*
2985          * Walk through the database table.
2986          */
2987         hash_seq_init(&hstat, pgStatDBHash);
2988         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
2989         {
2990                 /*
2991                  * Write out the DB entry including the number of live backends. We
2992                  * don't write the tables or functions pointers, since they're of
2993                  * no use to any other process.
2994                  */
2995                 fputc('D', fpout);
2996                 fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
2997
2998                 /*
2999                  * Walk through the database's access stats per table.
3000                  */
3001                 hash_seq_init(&tstat, dbentry->tables);
3002                 while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
3003                 {
3004                         fputc('T', fpout);
3005                         fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
3006                 }
3007
3008                 /*
3009                  * Walk through the database's function stats table.
3010                  */
3011                 hash_seq_init(&fstat, dbentry->functions);
3012                 while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
3013                 {
3014                         fputc('F', fpout);
3015                         fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
3016                 }
3017
3018                 /*
3019                  * Mark the end of this DB
3020                  */
3021                 fputc('d', fpout);
3022         }
3023
3024         /*
3025          * No more output to be done. Close the temp file and replace the old
3026          * pgstat.stat with it.  The ferror() check replaces testing for error
3027          * after each individual fputc or fwrite above.
3028          */
3029         fputc('E', fpout);
3030
3031         if (ferror(fpout))
3032         {
3033                 ereport(LOG,
3034                                 (errcode_for_file_access(),
3035                            errmsg("could not write temporary statistics file \"%s\": %m",
3036                                           tmpfile)));
3037                 FreeFile(fpout);
3038                 unlink(tmpfile);
3039         }
3040         else if (FreeFile(fpout) < 0)
3041         {
3042                 ereport(LOG,
3043                                 (errcode_for_file_access(),
3044                            errmsg("could not close temporary statistics file \"%s\": %m",
3045                                           tmpfile)));
3046                 unlink(tmpfile);
3047         }
3048         else if (rename(tmpfile, statfile) < 0)
3049         {
3050                 ereport(LOG,
3051                                 (errcode_for_file_access(),
3052                                  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
3053                                                 tmpfile, statfile)));
3054                 unlink(tmpfile);
3055         }
3056         else
3057         {
3058                 /*
3059                  * Successful write, so update last_statwrite.
3060                  */
3061                 last_statwrite = globalStats.stats_timestamp;
3062
3063                 /*
3064                  * It's not entirely clear whether there could be clock skew between
3065                  * backends and the collector; but just in case someone manages to
3066                  * send us a stats request time that's far in the future, reset it.
3067                  * This ensures that no inquiry message can cause more than one stats
3068                  * file write to occur.
3069                  */
3070                 last_statrequest = last_statwrite;
3071         }
3072         /* Reached after every outcome above except the early return on open failure */
3073         if (permanent)
3074                 unlink(pgstat_stat_filename);
3075 }
3076
3077
3078 /* ----------
3079  * pgstat_read_statsfile() -
3080  *
3081  *      Reads in an existing statistics collector file and initializes the
3082  *      databases' hash table (whose entries point to the tables' hash tables).
3083  * ----------
3084  */
3085 static HTAB *
3086 pgstat_read_statsfile(Oid onlydb, bool permanent)
3087 {
3088         PgStat_StatDBEntry *dbentry;
3089         PgStat_StatDBEntry dbbuf;
3090         PgStat_StatTabEntry *tabentry;
3091         PgStat_StatTabEntry tabbuf;
3092         PgStat_StatFuncEntry funcbuf;
3093         PgStat_StatFuncEntry *funcentry;
3094         HASHCTL         hash_ctl;
3095         HTAB       *dbhash;
3096         HTAB       *tabhash = NULL;
3097         HTAB       *funchash = NULL;
3098         FILE       *fpin;
3099         int32           format_id;
3100         bool            found;
3101         const char *statfile = permanent?PGSTAT_STAT_PERMANENT_FILENAME:pgstat_stat_filename;
3102
3103         /*
3104          * The tables will live in pgStatLocalContext.
3105          */
3106         pgstat_setup_memcxt();
3107
3108         /*
3109          * Create the DB hashtable
3110          */
3111         memset(&hash_ctl, 0, sizeof(hash_ctl));
3112         hash_ctl.keysize = sizeof(Oid);
3113         hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
3114         hash_ctl.hash = oid_hash;
3115         hash_ctl.hcxt = pgStatLocalContext;
3116         dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
3117                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3118
3119         /*
3120          * Clear out global statistics so they start from zero in case we can't
3121          * load an existing statsfile.
3122          */
3123         memset(&globalStats, 0, sizeof(globalStats));
3124
3125         /*
3126          * Try to open the status file. If it doesn't exist, the backends simply
3127          * return zero for anything and the collector simply starts from scratch
3128          * with empty counters.
3129          */
3130         if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
3131                 return dbhash;
3132
3133         /*
3134          * Verify it's of the expected format.
3135          */
3136         if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
3137                 || format_id != PGSTAT_FILE_FORMAT_ID)
3138         {
3139                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3140                                 (errmsg("corrupted pgstat.stat file")));
3141                 goto done;
3142         }
3143
3144         /*
3145          * Read global stats struct
3146          */
3147         if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
3148         {
3149                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3150                                 (errmsg("corrupted pgstat.stat file")));
3151                 goto done;
3152         }
3153
3154         /*
3155          * We found an existing collector stats file. Read it and put all the
3156          * hashtable entries into place.
3157          */
3158         for (;;)
3159         {
3160                 switch (fgetc(fpin))
3161                 {
3162                                 /*
3163                                  * 'D'  A PgStat_StatDBEntry struct describing a database
3164                                  * follows. Subsequently, zero to many 'T' and 'F' entries
3165                                  * will follow until a 'd' is encountered.
3166                                  */
3167                         case 'D':
3168                                 if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
3169                                                   fpin) != offsetof(PgStat_StatDBEntry, tables))
3170                                 {
3171                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3172                                                         (errmsg("corrupted pgstat.stat file")));
3173                                         goto done;
3174                                 }
3175
3176                                 /*
3177                                  * Add to the DB hash
3178                                  */
3179                                 dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
3180                                                                                                   (void *) &dbbuf.databaseid,
3181                                                                                                                          HASH_ENTER,
3182                                                                                                                          &found);
3183                                 if (found)
3184                                 {
3185                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3186                                                         (errmsg("corrupted pgstat.stat file")));
3187                                         goto done;
3188                                 }
3189
3190                                 memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
3191                                 dbentry->tables = NULL;
3192                                 dbentry->functions = NULL;
3193
3194                                 /*
3195                                  * Don't collect tables if not the requested DB (or the
3196                                  * shared-table info)
3197                                  */
3198                                 if (onlydb != InvalidOid)
3199                                 {
3200                                         if (dbbuf.databaseid != onlydb &&
3201                                                 dbbuf.databaseid != InvalidOid)
3202                                                 break;
3203                                 }
3204
3205                                 memset(&hash_ctl, 0, sizeof(hash_ctl));
3206                                 hash_ctl.keysize = sizeof(Oid);
3207                                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
3208                                 hash_ctl.hash = oid_hash;
3209                                 hash_ctl.hcxt = pgStatLocalContext;
3210                                 dbentry->tables = hash_create("Per-database table",
3211                                                                                           PGSTAT_TAB_HASH_SIZE,
3212                                                                                           &hash_ctl,
3213                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3214
3215                                 hash_ctl.keysize = sizeof(Oid);
3216                                 hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
3217                                 hash_ctl.hash = oid_hash;
3218                                 hash_ctl.hcxt = pgStatLocalContext;
3219                                 dbentry->functions = hash_create("Per-database function",
3220                                                                                                  PGSTAT_FUNCTION_HASH_SIZE,
3221                                                                                                  &hash_ctl,
3222                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3223                                 /*
3224                                  * Arrange that following records add entries to this
3225                                  * database's hash tables.
3226                                  */
3227                                 tabhash = dbentry->tables;
3228                                 funchash = dbentry->functions;
3229                                 break;
3230
3231                                 /*
3232                                  * 'd'  End of this database.
3233                                  */
3234                         case 'd':
3235                                 tabhash = NULL;
3236                                 funchash = NULL;
3237                                 break;
3238
3239                                 /*
3240                                  * 'T'  A PgStat_StatTabEntry follows.
3241                                  */
3242                         case 'T':
3243                                 if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
3244                                                   fpin) != sizeof(PgStat_StatTabEntry))
3245                                 {
3246                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3247                                                         (errmsg("corrupted pgstat.stat file")));
3248                                         goto done;
3249                                 }
3250
3251                                 /*
3252                                  * Skip if table belongs to a not requested database.
3253                                  */
3254                                 if (tabhash == NULL)
3255                                         break;
3256
3257                                 tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
3258                                                                                                         (void *) &tabbuf.tableid,
3259                                                                                                                  HASH_ENTER, &found);
3260
3261                                 if (found)
3262                                 {
3263                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3264                                                         (errmsg("corrupted pgstat.stat file")));
3265                                         goto done;
3266                                 }
3267
3268                                 memcpy(tabentry, &tabbuf, sizeof(tabbuf));
3269                                 break;
3270
3271                                 /*
3272                                  * 'F'  A PgStat_StatFuncEntry follows.
3273                                  */
3274                         case 'F':
3275                                 if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
3276                                                   fpin) != sizeof(PgStat_StatFuncEntry))
3277                                 {
3278                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3279                                                         (errmsg("corrupted pgstat.stat file")));
3280                                         goto done;
3281                                 }
3282
3283                                 /*
3284                                  * Skip if function belongs to a not requested database.
3285                                  */
3286                                 if (funchash == NULL)
3287                                         break;
3288
3289                                 funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
3290                                                                                                         (void *) &funcbuf.functionid,
3291                                                                                                                  HASH_ENTER, &found);
3292
3293                                 if (found)
3294                                 {
3295                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3296                                                         (errmsg("corrupted pgstat.stat file")));
3297                                         goto done;
3298                                 }
3299
3300                                 memcpy(funcentry, &funcbuf, sizeof(funcbuf));
3301                                 break;
3302
3303                                 /*
3304                                  * 'E'  The EOF marker of a complete stats file.
3305                                  */
3306                         case 'E':
3307                                 goto done;
3308
3309                         default:
3310                                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3311                                                 (errmsg("corrupted pgstat.stat file")));
3312                                 goto done;
3313                 }
3314         }
3315
3316 done:
3317         FreeFile(fpin);
3318
3319         if (permanent)
3320                 unlink(PGSTAT_STAT_PERMANENT_FILENAME);
3321
3322         return dbhash;
3323 }
3324
3325 /* ----------
3326  * pgstat_read_statsfile_timestamp() -
3327  *
3328  *      Attempt to fetch the timestamp of an existing stats file.
3329  *      Returns TRUE if successful (timestamp is stored at *ts).
3330  * ----------
3331  */
3332 static bool
3333 pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts)
3334 {
3335         PgStat_GlobalStats myGlobalStats;
3336         FILE       *fpin;
3337         int32           format_id;
3338         const char *statfile = permanent?PGSTAT_STAT_PERMANENT_FILENAME:pgstat_stat_filename;
3339
3340         /*
3341          * Try to open the status file.
3342          */
3343         if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
3344                 return false;
3345
3346         /*
3347          * Verify it's of the expected format.
3348          */
3349         if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
3350                 || format_id != PGSTAT_FILE_FORMAT_ID)
3351         {
3352                 FreeFile(fpin);
3353                 return false;
3354         }
3355
3356         /*
3357          * Read global stats struct
3358          */
3359         if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), fpin) != sizeof(myGlobalStats))
3360         {
3361                 FreeFile(fpin);
3362                 return false;
3363         }
3364
3365         *ts = myGlobalStats.stats_timestamp;
3366
3367         FreeFile(fpin);
3368         return true;
3369 }
3370
3371 /*
3372  * If not already done, read the statistics collector stats file into
3373  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
3374  * is called (typically, at end of transaction).
3375  */
3376 static void
3377 backend_read_statsfile(void)
3378 {
3379         TimestampTz min_ts;
3380         int                     count;
3381
3382         /* already read it? */
3383         if (pgStatDBHash)
3384                 return;
3385         Assert(!pgStatRunningInCollector);
3386
3387         /*
3388          * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec
3389          * before now.  This indirectly ensures that the collector needn't write
3390          * the file more often than PGSTAT_STAT_INTERVAL.  In an autovacuum
3391          * worker, however, we want a lower delay to avoid using stale data, so we
3392          * use PGSTAT_RETRY_DELAY (since the number of worker is low, this
3393          * shouldn't be a problem).
3394          *
3395          * Note that we don't recompute min_ts after sleeping; so we might end up
3396          * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In practice
3397          * that shouldn't happen, though, as long as the sleep time is less than
3398          * PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about
3399          * what our cutoff time really is.
3400          */
3401         if (IsAutoVacuumWorkerProcess())
3402                 min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
3403                                                                                          -PGSTAT_RETRY_DELAY);
3404         else
3405                 min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
3406                                                                                          -PGSTAT_STAT_INTERVAL);
3407
3408         /*
3409          * Loop until fresh enough stats file is available or we ran out of time.
3410          * The stats inquiry message is sent repeatedly in case collector drops it.
3411          */
3412         for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
3413         {
3414                 TimestampTz file_ts = 0;
3415
3416                 CHECK_FOR_INTERRUPTS();
3417
3418                 if (pgstat_read_statsfile_timestamp(false, &file_ts) &&
3419                         file_ts >= min_ts)
3420                         break;
3421
3422                 /* Not there or too old, so kick the collector and wait a bit */
3423                 pgstat_send_inquiry(min_ts);
3424                 pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
3425         }
3426
3427         if (count >= PGSTAT_POLL_LOOP_COUNT)
3428                 elog(WARNING, "pgstat wait timeout");
3429
3430         /* Autovacuum launcher wants stats about all databases */
3431         if (IsAutoVacuumLauncherProcess())
3432                 pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
3433         else
3434                 pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false);
3435 }
3436
3437
3438 /* ----------
3439  * pgstat_setup_memcxt() -
3440  *
3441  *      Create pgStatLocalContext, if not already done.
3442  * ----------
3443  */
3444 static void
3445 pgstat_setup_memcxt(void)
3446 {
3447         if (!pgStatLocalContext)
3448                 pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
3449                                                                                                    "Statistics snapshot",
3450                                                                                                    ALLOCSET_SMALL_MINSIZE,
3451                                                                                                    ALLOCSET_SMALL_INITSIZE,
3452                                                                                                    ALLOCSET_SMALL_MAXSIZE);
3453 }
3454
3455
3456 /* ----------
3457  * pgstat_clear_snapshot() -
3458  *
3459  *      Discard any data collected in the current transaction.  Any subsequent
3460  *      request will cause new snapshots to be read.
3461  *
3462  *      This is also invoked during transaction commit or abort to discard
3463  *      the no-longer-wanted snapshot.
3464  * ----------
3465  */
3466 void
3467 pgstat_clear_snapshot(void)
3468 {
3469         /* Release memory, if any was allocated */
3470         if (pgStatLocalContext)
3471                 MemoryContextDelete(pgStatLocalContext);
3472
3473         /* Reset variables */
3474         pgStatLocalContext = NULL;
3475         pgStatDBHash = NULL;
3476         localBackendStatusTable = NULL;
3477         localNumBackends = 0;
3478 }
3479
3480
3481 /* ----------
3482  * pgstat_recv_inquiry() -
3483  *
3484  *      Process stat inquiry requests.
3485  * ----------
3486  */
3487 static void
3488 pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
3489 {
3490         if (msg->inquiry_time > last_statrequest)
3491                 last_statrequest = msg->inquiry_time;
3492 }
3493
3494
3495 /* ----------
3496  * pgstat_recv_tabstat() -
3497  *
3498  *      Count what the backend has done.
3499  * ----------
3500  */
3501 static void
3502 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
3503 {
3504         PgStat_TableEntry *tabmsg = &(msg->m_entry[0]);
3505         PgStat_StatDBEntry *dbentry;
3506         PgStat_StatTabEntry *tabentry;
3507         int                     i;
3508         bool            found;
3509
3510         dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
3511
3512         /*
3513          * Update database-wide stats.
3514          */
3515         dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
3516         dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
3517
3518         /*
3519          * Process all table entries in the message.
3520          */
3521         for (i = 0; i < msg->m_nentries; i++)
3522         {
3523                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
3524                                                                                                   (void *) &(tabmsg[i].t_id),
3525                                                                                                            HASH_ENTER, &found);
3526
3527                 if (!found)
3528                 {
3529                         /*
3530                          * If it's a new table entry, initialize counters to the values we
3531                          * just got.
3532                          */
3533                         tabentry->numscans = tabmsg[i].t_counts.t_numscans;
3534                         tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
3535                         tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
3536                         tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
3537                         tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
3538                         tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
3539                         tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated;
3540                         tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
3541                         tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
3542                         tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
3543                         tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
3544
3545                         tabentry->last_anl_tuples = 0;
3546                         tabentry->vacuum_timestamp = 0;
3547                         tabentry->autovac_vacuum_timestamp = 0;
3548                         tabentry->analyze_timestamp = 0;
3549                         tabentry->autovac_analyze_timestamp = 0;
3550                 }
3551                 else
3552                 {
3553                         /*
3554                          * Otherwise add the values to the existing entry.
3555                          */
3556                         tabentry->numscans += tabmsg[i].t_counts.t_numscans;
3557                         tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
3558                         tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
3559                         tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
3560                         tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
3561                         tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
3562                         tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated;
3563                         tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
3564                         tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
3565                         tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
3566                         tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
3567                 }
3568
3569                 /* Clamp n_live_tuples in case of negative new_live_tuples */
3570                 tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
3571                 /* Likewise for n_dead_tuples */
3572                 tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
3573
3574                 /*
3575                  * Add per-table stats to the per-database entry, too.
3576                  */
3577                 dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
3578                 dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
3579                 dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
3580                 dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
3581                 dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
3582                 dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
3583                 dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
3584         }
3585 }
3586
3587
3588 /* ----------
3589  * pgstat_recv_tabpurge() -
3590  *
3591  *      Arrange for dead table removal.
3592  * ----------
3593  */
3594 static void
3595 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
3596 {
3597         PgStat_StatDBEntry *dbentry;
3598         int                     i;
3599
3600         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3601
3602         /*
3603          * No need to purge if we don't even know the database.
3604          */
3605         if (!dbentry || !dbentry->tables)
3606                 return;
3607
3608         /*
3609          * Process all table entries in the message.
3610          */
3611         for (i = 0; i < msg->m_nentries; i++)
3612         {
3613                 /* Remove from hashtable if present; we don't care if it's not. */
3614                 (void) hash_search(dbentry->tables,
3615                                                    (void *) &(msg->m_tableid[i]),
3616                                                    HASH_REMOVE, NULL);
3617         }
3618 }
3619
3620
3621 /* ----------
3622  * pgstat_recv_dropdb() -
3623  *
3624  *      Arrange for dead database removal
3625  * ----------
3626  */
3627 static void
3628 pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
3629 {
3630         PgStat_StatDBEntry *dbentry;
3631
3632         /*
3633          * Lookup the database in the hashtable.
3634          */
3635         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3636
3637         /*
3638          * If found, remove it.
3639          */
3640         if (dbentry)
3641         {
3642                 if (dbentry->tables != NULL)
3643                         hash_destroy(dbentry->tables);
3644                 if (dbentry->functions != NULL)
3645                         hash_destroy(dbentry->functions);
3646
3647                 if (hash_search(pgStatDBHash,
3648                                                 (void *) &(dbentry->databaseid),
3649                                                 HASH_REMOVE, NULL) == NULL)
3650                         ereport(ERROR,
3651                                         (errmsg("database hash table corrupted "
3652                                                         "during cleanup --- abort")));
3653         }
3654 }
3655
3656
3657 /* ----------
3658  * pgstat_recv_resetcounter() -
3659  *
3660  *      Reset the statistics for the specified database.
3661  * ----------
3662  */
3663 static void
3664 pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
3665 {
3666         HASHCTL         hash_ctl;
3667         PgStat_StatDBEntry *dbentry;
3668
3669         /*
3670          * Lookup the database in the hashtable.  Nothing to do if not there.
3671          */
3672         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3673
3674         if (!dbentry)
3675                 return;
3676
3677         /*
3678          * We simply throw away all the database's table entries by recreating a
3679          * new hash table for them.
3680          */
3681         if (dbentry->tables != NULL)
3682                 hash_destroy(dbentry->tables);
3683         if (dbentry->functions != NULL)
3684                 hash_destroy(dbentry->functions);
3685
3686         dbentry->tables = NULL;
3687         dbentry->functions = NULL;
3688         dbentry->n_xact_commit = 0;
3689         dbentry->n_xact_rollback = 0;
3690         dbentry->n_blocks_fetched = 0;
3691         dbentry->n_blocks_hit = 0;
3692
3693         memset(&hash_ctl, 0, sizeof(hash_ctl));
3694         hash_ctl.keysize = sizeof(Oid);
3695         hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
3696         hash_ctl.hash = oid_hash;
3697         dbentry->tables = hash_create("Per-database table",
3698                                                                   PGSTAT_TAB_HASH_SIZE,
3699                                                                   &hash_ctl,
3700                                                                   HASH_ELEM | HASH_FUNCTION);
3701
3702         hash_ctl.keysize = sizeof(Oid);
3703         hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
3704         hash_ctl.hash = oid_hash;
3705         dbentry->functions = hash_create("Per-database function",
3706                                                                          PGSTAT_FUNCTION_HASH_SIZE,
3707                                                                          &hash_ctl,
3708                                                                          HASH_ELEM | HASH_FUNCTION);
3709 }
3710
3711 /* ----------
3712  * pgstat_recv_autovac() -
3713  *
3714  *      Process an autovacuum signalling message.
3715  * ----------
3716  */
3717 static void
3718 pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
3719 {
3720         PgStat_StatDBEntry *dbentry;
3721
3722         /*
3723          * Lookup the database in the hashtable.  Don't create the entry if it
3724          * doesn't exist, because autovacuum may be processing a template
3725          * database.  If this isn't the case, the database is most likely to have
3726          * an entry already.  (If it doesn't, not much harm is done anyway --
3727          * it'll get created as soon as somebody actually uses the database.)
3728          */
3729         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3730         if (dbentry == NULL)
3731                 return;
3732
3733         /*
3734          * Store the last autovacuum time in the database entry.
3735          */
3736         dbentry->last_autovac_time = msg->m_start_time;
3737 }
3738
3739 /* ----------
3740  * pgstat_recv_vacuum() -
3741  *
3742  *      Process a VACUUM message.
3743  * ----------
3744  */
3745 static void
3746 pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
3747 {
3748         PgStat_StatDBEntry *dbentry;
3749         PgStat_StatTabEntry *tabentry;
3750
3751         /*
3752          * Don't create either the database or table entry if it doesn't already
3753          * exist.  This avoids bloating the stats with entries for stuff that is
3754          * only touched by vacuum and not by live operations.
3755          */
3756         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3757         if (dbentry == NULL)
3758                 return;
3759
3760         tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
3761                                                    HASH_FIND, NULL);
3762         if (tabentry == NULL)
3763                 return;
3764
3765         if (msg->m_autovacuum)
3766                 tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
3767         else
3768                 tabentry->vacuum_timestamp = msg->m_vacuumtime;
3769         if (msg->m_scanned_all)
3770                 tabentry->n_live_tuples = msg->m_tuples;
3771         /* Resetting dead_tuples to 0 is an approximation ... */
3772         tabentry->n_dead_tuples = 0;
3773         if (msg->m_analyze)
3774         {
3775                 if (msg->m_scanned_all)
3776                         tabentry->last_anl_tuples = msg->m_tuples;
3777                 if (msg->m_autovacuum)
3778                         tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
3779                 else
3780                         tabentry->analyze_timestamp = msg->m_vacuumtime;
3781         }
3782         else
3783         {
3784                 /* last_anl_tuples must never exceed n_live_tuples+n_dead_tuples */
3785                 tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples,
3786                                                                                 tabentry->n_live_tuples);
3787         }
3788 }
3789
3790 /* ----------
3791  * pgstat_recv_analyze() -
3792  *
3793  *      Process an ANALYZE message.
3794  * ----------
3795  */
3796 static void
3797 pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
3798 {
3799         PgStat_StatDBEntry *dbentry;
3800         PgStat_StatTabEntry *tabentry;
3801
3802         /*
3803          * Don't create either the database or table entry if it doesn't already
3804          * exist.  This avoids bloating the stats with entries for stuff that is
3805          * only touched by analyze and not by live operations.
3806          */
3807         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3808         if (dbentry == NULL)
3809                 return;
3810
3811         tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
3812                                                    HASH_FIND, NULL);
3813         if (tabentry == NULL)
3814                 return;
3815
3816         if (msg->m_autovacuum)
3817                 tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
3818         else
3819                 tabentry->analyze_timestamp = msg->m_analyzetime;
3820         tabentry->n_live_tuples = msg->m_live_tuples;
3821         tabentry->n_dead_tuples = msg->m_dead_tuples;
3822         tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
3823 }
3824
3825
3826 /* ----------
3827  * pgstat_recv_bgwriter() -
3828  *
3829  *      Process a BGWRITER message.
3830  * ----------
3831  */
3832 static void
3833 pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
3834 {
3835         globalStats.timed_checkpoints += msg->m_timed_checkpoints;
3836         globalStats.requested_checkpoints += msg->m_requested_checkpoints;
3837         globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
3838         globalStats.buf_written_clean += msg->m_buf_written_clean;
3839         globalStats.maxwritten_clean += msg->m_maxwritten_clean;
3840         globalStats.buf_written_backend += msg->m_buf_written_backend;
3841         globalStats.buf_alloc += msg->m_buf_alloc;
3842 }
3843
3844 /* ----------
3845  * pgstat_recv_funcstat() -
3846  *
3847  *      Count what the backend has done.
3848  * ----------
3849  */
3850 static void
3851 pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
3852 {
3853         PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
3854         PgStat_StatDBEntry *dbentry;
3855         PgStat_StatFuncEntry *funcentry;
3856         int                     i;
3857         bool            found;
3858
3859         dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
3860
3861         /*
3862          * Process all function entries in the message.
3863          */
3864         for (i = 0; i < msg->m_nentries; i++, funcmsg++)
3865         {
3866                 funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
3867                                                                                                   (void *) &(funcmsg->f_id),
3868                                                                                                            HASH_ENTER, &found);
3869
3870                 if (!found)
3871                 {
3872                         /*
3873                          * If it's a new function entry, initialize counters to the values
3874                          * we just got.
3875                          */
3876                         funcentry->f_numcalls = funcmsg->f_numcalls;
3877                         funcentry->f_time = funcmsg->f_time;
3878                         funcentry->f_time_self = funcmsg->f_time_self;
3879                 }
3880                 else
3881                 {
3882                         /*
3883                          * Otherwise add the values to the existing entry.
3884                          */
3885                         funcentry->f_numcalls += funcmsg->f_numcalls;
3886                         funcentry->f_time += funcmsg->f_time;
3887                         funcentry->f_time_self += funcmsg->f_time_self;
3888                 }
3889         }
3890 }
3891
3892 /* ----------
3893  * pgstat_recv_funcpurge() -
3894  *
3895  *      Arrange for dead function removal.
3896  * ----------
3897  */
3898 static void
3899 pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
3900 {
3901         PgStat_StatDBEntry *dbentry;
3902         int                     i;
3903
3904         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3905
3906         /*
3907          * No need to purge if we don't even know the database.
3908          */
3909         if (!dbentry || !dbentry->functions)
3910                 return;
3911
3912         /*
3913          * Process all function entries in the message.
3914          */
3915         for (i = 0; i < msg->m_nentries; i++)
3916         {
3917                 /* Remove from hashtable if present; we don't care if it's not. */
3918                 (void) hash_search(dbentry->functions,
3919                                                    (void *) &(msg->m_functionid[i]),
3920                                                    HASH_REMOVE, NULL);
3921         }
3922 }