]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/pgstat.c
Unconditionally write the statsfile when SIGHUP is received, to minimize
[postgresql] / src / backend / postmaster / pgstat.c
1 /* ----------
2  * pgstat.c
3  *
4  *      All the statistics collector stuff hacked up in one big, ugly file.
5  *
6  *      TODO:   - Separate collector, postmaster and backend stuff
7  *                        into different files.
8  *
9  *                      - Add some automatic call for pgstat vacuuming.
10  *
11  *                      - Add a pgstat config column to pg_database, so this
12  *                        entire thing can be enabled/disabled on a per db basis.
13  *
14  *      Copyright (c) 2001-2008, PostgreSQL Global Development Group
15  *
16  *      $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.181 2008/08/25 18:55:43 mha Exp $
17  * ----------
18  */
19 #include "postgres.h"
20
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/param.h>
24 #include <sys/time.h>
25 #include <sys/socket.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <arpa/inet.h>
29 #include <signal.h>
30 #include <time.h>
31 #ifdef HAVE_POLL_H
32 #include <poll.h>
33 #endif
34 #ifdef HAVE_SYS_POLL_H
35 #include <sys/poll.h>
36 #endif
37
38 #include "pgstat.h"
39
40 #include "access/heapam.h"
41 #include "access/transam.h"
42 #include "access/twophase_rmgr.h"
43 #include "access/xact.h"
44 #include "catalog/pg_database.h"
45 #include "catalog/pg_proc.h"
46 #include "libpq/ip.h"
47 #include "libpq/libpq.h"
48 #include "libpq/pqsignal.h"
49 #include "mb/pg_wchar.h"
50 #include "miscadmin.h"
51 #include "pg_trace.h"
52 #include "postmaster/autovacuum.h"
53 #include "postmaster/fork_process.h"
54 #include "postmaster/postmaster.h"
55 #include "storage/backendid.h"
56 #include "storage/fd.h"
57 #include "storage/ipc.h"
58 #include "storage/pg_shmem.h"
59 #include "storage/pmsignal.h"
60 #include "utils/guc.h"
61 #include "utils/memutils.h"
62 #include "utils/ps_status.h"
63 #include "utils/rel.h"
64 #include "utils/tqual.h"
65
66
67 /* ----------
68  * Paths for the statistics files (relative to installation's $PGDATA).
69  * ----------
70  */
71 #define PGSTAT_STAT_PERMANENT_FILENAME          "global/pgstat.stat"
72 #define PGSTAT_STAT_PERMANENT_TMPFILE           "global/pgstat.tmp"
73
74 /* ----------
75  * Timer definitions.
76  * ----------
77  */
78 #define PGSTAT_STAT_INTERVAL    500             /* How often to write the status file;
79                                                                                  * in milliseconds. */
80
81 #define PGSTAT_RESTART_INTERVAL 60              /* How often to attempt to restart a
82                                                                                  * failed statistics collector; in
83                                                                                  * seconds. */
84
85 #define PGSTAT_SELECT_TIMEOUT   2               /* How often to check for postmaster
86                                                                                  * death; in seconds. */
87
88
89 /* ----------
90  * The initial size hints for the hash tables used in the collector.
91  * ----------
92  */
93 #define PGSTAT_DB_HASH_SIZE             16
94 #define PGSTAT_TAB_HASH_SIZE    512
95 #define PGSTAT_FUNCTION_HASH_SIZE       512
96
97
98 /* ----------
99  * GUC parameters
100  * ----------
101  */
102 bool            pgstat_track_activities = false;
103 bool            pgstat_track_counts = false;
104 int                     pgstat_track_functions = TRACK_FUNC_OFF;
105 int                     pgstat_track_activity_query_size = 1024;
106
107 /* ----------
108  * Built from GUC parameter
109  * ----------
110  */
111 char       *pgstat_stat_filename = NULL;
112 char       *pgstat_stat_tmpname = NULL;
113
114 /*
115  * BgWriter global statistics counters (unused in other processes).
116  * Stored directly in a stats message structure so it can be sent
117  * without needing to copy things around.  We assume this inits to zeroes.
118  */
119 PgStat_MsgBgWriter BgWriterStats;
120
121 /* ----------
122  * Local data
123  * ----------
124  */
125 NON_EXEC_STATIC int pgStatSock = -1;
126
127 static struct sockaddr_storage pgStatAddr;
128
129 static time_t last_pgstat_start_time;
130
131 static bool pgStatRunningInCollector = false;
132
133 /*
134  * Structures in which backends store per-table info that's waiting to be
135  * sent to the collector.
136  *
137  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
138  * for the life of the backend.  Also, we zero out the t_id fields of the
139  * contained PgStat_TableStatus structs whenever they are not actively in use.
140  * This allows relcache pgstat_info pointers to be treated as long-lived data,
141  * avoiding repeated searches in pgstat_initstats() when a relation is
142  * repeatedly opened during a transaction.
143  */
144 #define TABSTAT_QUANTUM         100 /* we alloc this many at a time */
145
146 typedef struct TabStatusArray
147 {
148         struct TabStatusArray *tsa_next;        /* link to next array, if any */
149         int                     tsa_used;               /* # entries currently used */
150         PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];        /* per-table data */
151 } TabStatusArray;
152
153 static TabStatusArray *pgStatTabList = NULL;
154
155 /*
156  * Backends store per-function info that's waiting to be sent to the collector
157  * in this hash table (indexed by function OID).
158  */
159 static HTAB *pgStatFunctions = NULL;
160
161 /*
162  * Tuple insertion/deletion counts for an open transaction can't be propagated
163  * into PgStat_TableStatus counters until we know if it is going to commit
164  * or abort.  Hence, we keep these counts in per-subxact structs that live
165  * in TopTransactionContext.  This data structure is designed on the assumption
166  * that subxacts won't usually modify very many tables.
167  */
168 typedef struct PgStat_SubXactStatus
169 {
170         int                     nest_level;             /* subtransaction nest level */
171         struct PgStat_SubXactStatus *prev;      /* higher-level subxact if any */
172         PgStat_TableXactStatus *first;          /* head of list for this subxact */
173 } PgStat_SubXactStatus;
174
175 static PgStat_SubXactStatus *pgStatXactStack = NULL;
176
177 static int      pgStatXactCommit = 0;
178 static int      pgStatXactRollback = 0;
179
180 /* Record that's written to 2PC state file when pgstat state is persisted */
181 typedef struct TwoPhasePgStatRecord
182 {
183         PgStat_Counter tuples_inserted;         /* tuples inserted in xact */
184         PgStat_Counter tuples_deleted;          /* tuples deleted in xact */
185         Oid                     t_id;                   /* table's OID */
186         bool            t_shared;               /* is it a shared catalog? */
187 } TwoPhasePgStatRecord;
188
189 /*
190  * Info about current "snapshot" of stats file
191  */
192 static MemoryContext pgStatLocalContext = NULL;
193 static HTAB *pgStatDBHash = NULL;
194 static PgBackendStatus *localBackendStatusTable = NULL;
195 static int      localNumBackends = 0;
196
197 /*
198  * Cluster wide statistics, kept in the stats collector.
199  * Contains statistics that are not collected per database
200  * or per table.
201  */
202 static PgStat_GlobalStats globalStats;
203
204 static volatile bool need_exit = false;
205 static volatile bool need_statwrite = false;
206 static volatile bool got_SIGHUP = false;
207
208 /*
209  * Total time charged to functions so far in the current backend.
210  * We use this to help separate "self" and "other" time charges.
211  * (We assume this initializes to zero.)
212  */
213 static instr_time total_func_time;
214
215
216 /* ----------
217  * Local function forward declarations
218  * ----------
219  */
220 #ifdef EXEC_BACKEND
221 static pid_t pgstat_forkexec(void);
222 #endif
223
224 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
225 static void pgstat_exit(SIGNAL_ARGS);
226 static void force_statwrite(SIGNAL_ARGS);
227 static void pgstat_beshutdown_hook(int code, Datum arg);
228 static void pgstat_sighup_handler(SIGNAL_ARGS);
229
230 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
231 static void pgstat_write_statsfile(bool permanent);
232 static HTAB *pgstat_read_statsfile(Oid onlydb, bool permanent);
233 static void backend_read_statsfile(void);
234 static void pgstat_read_current_status(void);
235
236 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
237 static void pgstat_send_funcstats(void);
238 static HTAB *pgstat_collect_oids(Oid catalogid);
239
240 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
241
242 static void pgstat_setup_memcxt(void);
243
244 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
245 static void pgstat_send(void *msg, int len);
246
247 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
248 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
249 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
250 static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
251 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
252 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
253 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
254 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
255 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
256 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
257
258
259 /* ------------------------------------------------------------
260  * Public functions called from postmaster follow
261  * ------------------------------------------------------------
262  */
263
/* ----------
 * pgstat_init() -
 *
 *	Called from postmaster at startup.  Sets up the UDP socket (pgStatSock)
 *	used for sending and receiving statistics messages.  Every address that
 *	"localhost" resolves to is tried in turn until one of them survives
 *	socket/bind/getsockname/connect plus a one-byte send/receive self-test.
 *
 *	Failure is never fatal: if no address works we log the fact, leave
 *	pgStatSock at -1 and force track_counts off --- better to let the
 *	postmaster start with stats collection disabled.
 * ----------
 */
void
pgstat_init(void)
{
	ACCEPT_TYPE_ARG3 addrlen;
	struct addrinfo hints;
	struct addrinfo *addr_list = NULL;
	struct addrinfo *cand;
	int			gai_rc;
	int			io_rc;
	int			nready;
	int			attempts = 0;
	fd_set		readfds;
	struct timeval timeout;
	char		probe;

#define TESTBYTEVAL ((char) 199)

	/*
	 * Look up datagram addresses for "localhost".
	 */
	hints.ai_flags = AI_PASSIVE;
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_DGRAM;
	hints.ai_protocol = 0;
	hints.ai_addrlen = 0;
	hints.ai_addr = NULL;
	hints.ai_canonname = NULL;
	hints.ai_next = NULL;

	gai_rc = pg_getaddrinfo_all("localhost", NULL, &hints, &addr_list);
	if (gai_rc != 0 || addr_list == NULL)
	{
		ereport(LOG,
				(errmsg("could not resolve \"localhost\": %s",
						gai_strerror(gai_rc))));
		goto startup_failed;
	}

	/*
	 * The lookup can hand back several addresses of which only one really
	 * works (eg, both IPv6 and IPv4 when the kernel rejects IPv6), and the
	 * breakage may not show up before bind() or even connect().  Hence walk
	 * the whole list until a candidate passes every step.  Rejected
	 * candidates produce LOG messages only, never an error.
	 *
	 * Every failure after the socket exists jumps to discard_socket at the
	 * bottom of the loop body, which closes it and moves to the next
	 * candidate.  The LOG message is always emitted before that jump, so
	 * that %m still reflects the failing call.
	 */
	for (cand = addr_list; cand != NULL; cand = cand->ai_next)
	{
#ifdef HAVE_UNIX_SOCKETS
		/* AF_UNIX entries are of no use here; skip them. */
		if (cand->ai_family == AF_UNIX)
			continue;
#endif

		attempts++;
		if (attempts > 1)
			ereport(LOG,
					(errmsg("trying another address for the statistics collector")));

		/* Step 1: create the socket. */
		pgStatSock = socket(cand->ai_family, SOCK_DGRAM, 0);
		if (pgStatSock < 0)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not create socket for statistics collector: %m")));
			continue;
		}

		/* Step 2: bind to a kernel-assigned port on localhost ... */
		if (bind(pgStatSock, cand->ai_addr, cand->ai_addrlen) < 0)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not bind socket for statistics collector: %m")));
			goto discard_socket;
		}

		/* ... and find out which port we were given. */
		addrlen = sizeof(pgStatAddr);
		if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &addrlen) < 0)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not get address of socket for statistics collector: %m")));
			goto discard_socket;
		}

		/*
		 * Step 3: connect the socket to its own address.  That avoids
		 * respecifying the target on every send, and gives a kernel-level
		 * check that only packets from this same address are received.
		 */
		if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, addrlen) < 0)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not connect socket for statistics collector: %m")));
			goto discard_socket;
		}

		/*
		 * Step 4: push one test byte through the socket, to catch setups
		 * where the socket can be created but will not actually pass data
		 * (for instance, because of kernel packet filtering rules).
		 */
		probe = TESTBYTEVAL;

		do
		{
			io_rc = send(pgStatSock, &probe, 1, 0);
		} while (io_rc != 1 && errno == EINTR);	/* retry if interrupted */

		if (io_rc != 1)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not send test message on socket for statistics collector: %m")));
			goto discard_socket;
		}

		/*
		 * Delivery might take a moment.  We arbitrarily wait up to half a
		 * second before concluding it's broken; the wait is restarted from
		 * scratch whenever select() is interrupted.
		 */
		do
		{
			FD_ZERO(&readfds);
			FD_SET(pgStatSock, &readfds);
			timeout.tv_sec = 0;
			timeout.tv_usec = 500000;
			nready = select(pgStatSock + 1, &readfds, NULL, NULL, &timeout);
		} while (nready < 0 && errno == EINTR);

		if (nready < 0)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("select() failed in statistics collector: %m")));
			goto discard_socket;
		}
		if (nready == 0 || !FD_ISSET(pgStatSock, &readfds))
		{
			/*
			 * This is the failure we actually expect to see in practice, so
			 * it gets its own specific message.  errno is not meaningful
			 * here, hence no %m.
			 */
			ereport(LOG,
					(errcode(ERRCODE_CONNECTION_FAILURE),
					 errmsg("test message did not get through on socket for statistics collector")));
			goto discard_socket;
		}

		/* Scribble on the variable so a successful recv() must overwrite it */
		probe++;

		do
		{
			io_rc = recv(pgStatSock, &probe, 1, 0);
		} while (io_rc != 1 && errno == EINTR);	/* retry if interrupted */

		if (io_rc != 1)
		{
			ereport(LOG,
					(errcode_for_socket_access(),
					 errmsg("could not receive test message on socket for statistics collector: %m")));
			goto discard_socket;
		}

		if (probe != TESTBYTEVAL)	/* strictly paranoia ... */
		{
			ereport(LOG,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("incorrect test message transmission on socket for statistics collector")));
			goto discard_socket;
		}

		/* Every step passed: this socket works, stop searching. */
		break;

discard_socket:
		closesocket(pgStatSock);
		pgStatSock = -1;
	}

	/* Did any candidate make it through? */
	if (cand == NULL || pgStatSock < 0)
		goto startup_failed;

	/*
	 * Switch the socket to non-blocking IO, so that if the collector falls
	 * behind, statistics messages get discarded instead of making backends
	 * block while sending to the collector.
	 */
	if (!pg_set_noblock(pgStatSock))
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not set statistics collector socket to nonblocking mode: %m")));
		goto startup_failed;
	}

	pg_freeaddrinfo_all(hints.ai_family, addr_list);

	return;

startup_failed:
	ereport(LOG,
			(errmsg("disabling statistics collector for lack of working socket")));

	if (addr_list != NULL)
		pg_freeaddrinfo_all(hints.ai_family, addr_list);

	if (pgStatSock >= 0)
		closesocket(pgStatSock);
	pgStatSock = -1;

	/*
	 * Turn track_counts off, both to suppress useless activity and as a
	 * debugging clue that we failed here.  PGC_S_OVERRIDE is used because
	 * there is no point in trying to turn it back on from postgresql.conf
	 * without a restart.
	 */
	SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
}
511
512 /*
513  * pgstat_reset_all() -
514  *
515  * Remove the stats file.  This is currently used only if WAL
516  * recovery is needed after a crash.
517  */
518 void
519 pgstat_reset_all(void)
520 {
521         unlink(pgstat_stat_filename);
522         unlink(PGSTAT_STAT_PERMANENT_FILENAME);
523 }
524
525 #ifdef EXEC_BACKEND
526
527 /*
528  * pgstat_forkexec() -
529  *
530  * Format up the arglist for, then fork and exec, statistics collector process
531  */
532 static pid_t
533 pgstat_forkexec(void)
534 {
535         char       *av[10];
536         int                     ac = 0;
537
538         av[ac++] = "postgres";
539         av[ac++] = "--forkcol";
540         av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
541
542         av[ac] = NULL;
543         Assert(ac < lengthof(av));
544
545         return postmaster_forkexec(ac, av);
546 }
547 #endif   /* EXEC_BACKEND */
548
549
550 /*
551  * pgstat_start() -
552  *
553  *      Called from postmaster at startup or after an existing collector
554  *      died.  Attempt to fire up a fresh statistics collector.
555  *
556  *      Returns PID of child process, or 0 if fail.
557  *
558  *      Note: if fail, we will be called again from the postmaster main loop.
559  */
560 int
561 pgstat_start(void)
562 {
563         time_t          curtime;
564         pid_t           pgStatPid;
565
566         /*
567          * Check that the socket is there, else pgstat_init failed and we can do
568          * nothing useful.
569          */
570         if (pgStatSock < 0)
571                 return 0;
572
573         /*
574          * Do nothing if too soon since last collector start.  This is a safety
575          * valve to protect against continuous respawn attempts if the collector
576          * is dying immediately at launch.      Note that since we will be re-called
577          * from the postmaster main loop, we will get another chance later.
578          */
579         curtime = time(NULL);
580         if ((unsigned int) (curtime - last_pgstat_start_time) <
581                 (unsigned int) PGSTAT_RESTART_INTERVAL)
582                 return 0;
583         last_pgstat_start_time = curtime;
584
585         /*
586          * Okay, fork off the collector.
587          */
588 #ifdef EXEC_BACKEND
589         switch ((pgStatPid = pgstat_forkexec()))
590 #else
591         switch ((pgStatPid = fork_process()))
592 #endif
593         {
594                 case -1:
595                         ereport(LOG,
596                                         (errmsg("could not fork statistics collector: %m")));
597                         return 0;
598
599 #ifndef EXEC_BACKEND
600                 case 0:
601                         /* in postmaster child ... */
602                         /* Close the postmaster's sockets */
603                         ClosePostmasterPorts(false);
604
605                         /* Lose the postmaster's on-exit routines */
606                         on_exit_reset();
607
608                         /* Drop our connection to postmaster's shared memory, as well */
609                         PGSharedMemoryDetach();
610
611                         PgstatCollectorMain(0, NULL);
612                         break;
613 #endif
614
615                 default:
616                         return (int) pgStatPid;
617         }
618
619         /* shouldn't get here */
620         return 0;
621 }
622
623 void
624 allow_immediate_pgstat_restart(void)
625 {
626         last_pgstat_start_time = 0;
627 }
628
629 /* ------------------------------------------------------------
630  * Public functions used by backends follow
631  *------------------------------------------------------------
632  */
633
634
635 /* ----------
636  * pgstat_report_stat() -
637  *
638  *      Called from tcop/postgres.c to send the so far collected per-table
639  *      and function usage statistics to the collector.  Note that this is
640  *      called only when not within a transaction, so it is fair to use
641  *      transaction stop time as an approximation of current time.
642  * ----------
643  */
644 void
645 pgstat_report_stat(bool force)
646 {
647         /* we assume this inits to all zeroes: */
648         static const PgStat_TableCounts all_zeroes;
649         static TimestampTz last_report = 0;
650
651         TimestampTz now;
652         PgStat_MsgTabstat regular_msg;
653         PgStat_MsgTabstat shared_msg;
654         TabStatusArray *tsa;
655         int                     i;
656
657         /* Don't expend a clock check if nothing to do */
658         /* Note: we ignore pending function stats in this test ... OK? */
659         if (pgStatTabList == NULL ||
660                 pgStatTabList->tsa_used == 0)
661                 return;
662
663         /*
664          * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
665          * msec since we last sent one, or the caller wants to force stats out.
666          */
667         now = GetCurrentTransactionStopTimestamp();
668         if (!force &&
669                 !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL))
670                 return;
671         last_report = now;
672
673         /*
674          * Scan through the TabStatusArray struct(s) to find tables that actually
675          * have counts, and build messages to send.  We have to separate shared
676          * relations from regular ones because the databaseid field in the message
677          * header has to depend on that.
678          */
679         regular_msg.m_databaseid = MyDatabaseId;
680         shared_msg.m_databaseid = InvalidOid;
681         regular_msg.m_nentries = 0;
682         shared_msg.m_nentries = 0;
683
684         for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
685         {
686                 for (i = 0; i < tsa->tsa_used; i++)
687                 {
688                         PgStat_TableStatus *entry = &tsa->tsa_entries[i];
689                         PgStat_MsgTabstat *this_msg;
690                         PgStat_TableEntry *this_ent;
691
692                         /* Shouldn't have any pending transaction-dependent counts */
693                         Assert(entry->trans == NULL);
694
695                         /*
696                          * Ignore entries that didn't accumulate any actual counts, such
697                          * as indexes that were opened by the planner but not used.
698                          */
699                         if (memcmp(&entry->t_counts, &all_zeroes,
700                                            sizeof(PgStat_TableCounts)) == 0)
701                                 continue;
702
703                         /*
704                          * OK, insert data into the appropriate message, and send if full.
705                          */
706                         this_msg = entry->t_shared ? &shared_msg : &regular_msg;
707                         this_ent = &this_msg->m_entry[this_msg->m_nentries];
708                         this_ent->t_id = entry->t_id;
709                         memcpy(&this_ent->t_counts, &entry->t_counts,
710                                    sizeof(PgStat_TableCounts));
711                         if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
712                         {
713                                 pgstat_send_tabstat(this_msg);
714                                 this_msg->m_nentries = 0;
715                         }
716                 }
717                 /* zero out TableStatus structs after use */
718                 MemSet(tsa->tsa_entries, 0,
719                            tsa->tsa_used * sizeof(PgStat_TableStatus));
720                 tsa->tsa_used = 0;
721         }
722
723         /*
724          * Send partial messages.  If force is true, make sure that any pending
725          * xact commit/abort gets counted, even if no table stats to send.
726          */
727         if (regular_msg.m_nentries > 0 ||
728                 (force && (pgStatXactCommit > 0 || pgStatXactRollback > 0)))
729                 pgstat_send_tabstat(&regular_msg);
730         if (shared_msg.m_nentries > 0)
731                 pgstat_send_tabstat(&shared_msg);
732
733         /* Now, send function statistics */
734         pgstat_send_funcstats();
735 }
736
737 /*
738  * Subroutine for pgstat_report_stat: finish and send a tabstat message
739  */
740 static void
741 pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg)
742 {
743         int                     n;
744         int                     len;
745
746         /* It's unlikely we'd get here with no socket, but maybe not impossible */
747         if (pgStatSock < 0)
748                 return;
749
750         /*
751          * Report accumulated xact commit/rollback whenever we send a normal
752          * tabstat message
753          */
754         if (OidIsValid(tsmsg->m_databaseid))
755         {
756                 tsmsg->m_xact_commit = pgStatXactCommit;
757                 tsmsg->m_xact_rollback = pgStatXactRollback;
758                 pgStatXactCommit = 0;
759                 pgStatXactRollback = 0;
760         }
761         else
762         {
763                 tsmsg->m_xact_commit = 0;
764                 tsmsg->m_xact_rollback = 0;
765         }
766
767         n = tsmsg->m_nentries;
768         len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
769                 n * sizeof(PgStat_TableEntry);
770
771         pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
772         pgstat_send(tsmsg, len);
773 }
774
775 /*
776  * Subroutine for pgstat_report_stat: populate and send a function stat message
777  */
778 static void
779 pgstat_send_funcstats(void)
780 {
781         /* we assume this inits to all zeroes: */
782         static const PgStat_FunctionCounts all_zeroes;
783
784         PgStat_MsgFuncstat msg;
785         PgStat_BackendFunctionEntry *entry;
786         HASH_SEQ_STATUS fstat;
787
788         if (pgStatFunctions == NULL)
789                 return;
790
791         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT);
792         msg.m_databaseid = MyDatabaseId;
793         msg.m_nentries = 0;
794
795         hash_seq_init(&fstat, pgStatFunctions);
796         while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
797         {
798                 PgStat_FunctionEntry *m_ent;
799
800                 /* Skip it if no counts accumulated since last time */
801                 if (memcmp(&entry->f_counts, &all_zeroes,
802                                    sizeof(PgStat_FunctionCounts)) == 0)
803                         continue;
804
805                 /* need to convert format of time accumulators */
806                 m_ent = &msg.m_entry[msg.m_nentries];
807                 m_ent->f_id = entry->f_id;
808                 m_ent->f_numcalls = entry->f_counts.f_numcalls;
809                 m_ent->f_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time);
810                 m_ent->f_time_self = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_time_self);
811
812                 if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
813                 {
814                         pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
815                                                 msg.m_nentries * sizeof(PgStat_FunctionEntry));
816                         msg.m_nentries = 0;
817                 }
818
819                 /* reset the entry's counts */
820                 MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
821         }
822
823         if (msg.m_nentries > 0)
824                 pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
825                                         msg.m_nentries * sizeof(PgStat_FunctionEntry));
826 }
827
828
829 /* ----------
830  * pgstat_vacuum_stat() -
831  *
832  *      Will tell the collector about objects he can get rid of.
833  * ----------
834  */
835 void
836 pgstat_vacuum_stat(void)
837 {
838         HTAB       *htab;
839         PgStat_MsgTabpurge msg;
840         PgStat_MsgFuncpurge f_msg;
841         HASH_SEQ_STATUS hstat;
842         PgStat_StatDBEntry *dbentry;
843         PgStat_StatTabEntry *tabentry;
844         PgStat_StatFuncEntry *funcentry;
845         int                     len;
846
847         if (pgStatSock < 0)
848                 return;
849
850         /*
851          * If not done for this transaction, read the statistics collector stats
852          * file into some hash tables.
853          */
854         backend_read_statsfile();
855
856         /*
857          * Read pg_database and make a list of OIDs of all existing databases
858          */
859         htab = pgstat_collect_oids(DatabaseRelationId);
860
861         /*
862          * Search the database hash table for dead databases and tell the
863          * collector to drop them.
864          */
865         hash_seq_init(&hstat, pgStatDBHash);
866         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
867         {
868                 Oid                     dbid = dbentry->databaseid;
869
870                 CHECK_FOR_INTERRUPTS();
871
872                 /* the DB entry for shared tables (with InvalidOid) is never dropped */
873                 if (OidIsValid(dbid) &&
874                         hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
875                         pgstat_drop_database(dbid);
876         }
877
878         /* Clean up */
879         hash_destroy(htab);
880
881         /*
882          * Lookup our own database entry; if not found, nothing more to do.
883          */
884         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
885                                                                                                  (void *) &MyDatabaseId,
886                                                                                                  HASH_FIND, NULL);
887         if (dbentry == NULL || dbentry->tables == NULL)
888                 return;
889
890         /*
891          * Similarly to above, make a list of all known relations in this DB.
892          */
893         htab = pgstat_collect_oids(RelationRelationId);
894
895         /*
896          * Initialize our messages table counter to zero
897          */
898         msg.m_nentries = 0;
899
900         /*
901          * Check for all tables listed in stats hashtable if they still exist.
902          */
903         hash_seq_init(&hstat, dbentry->tables);
904         while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
905         {
906                 Oid                     tabid = tabentry->tableid;
907
908                 CHECK_FOR_INTERRUPTS();
909
910                 if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
911                         continue;
912
913                 /*
914                  * Not there, so add this table's Oid to the message
915                  */
916                 msg.m_tableid[msg.m_nentries++] = tabid;
917
918                 /*
919                  * If the message is full, send it out and reinitialize to empty
920                  */
921                 if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
922                 {
923                         len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
924                                 +msg.m_nentries * sizeof(Oid);
925
926                         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
927                         msg.m_databaseid = MyDatabaseId;
928                         pgstat_send(&msg, len);
929
930                         msg.m_nentries = 0;
931                 }
932         }
933
934         /*
935          * Send the rest
936          */
937         if (msg.m_nentries > 0)
938         {
939                 len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
940                         +msg.m_nentries * sizeof(Oid);
941
942                 pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
943                 msg.m_databaseid = MyDatabaseId;
944                 pgstat_send(&msg, len);
945         }
946
947         /* Clean up */
948         hash_destroy(htab);
949
950         /*
951          * Now repeat the above steps for functions.
952          */
953         htab = pgstat_collect_oids(ProcedureRelationId);
954
955         pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE);
956         f_msg.m_databaseid = MyDatabaseId;
957         f_msg.m_nentries = 0;
958
959         hash_seq_init(&hstat, dbentry->functions);
960         while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
961         {
962                 Oid                     funcid = funcentry->functionid;
963
964                 CHECK_FOR_INTERRUPTS();
965
966                 if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
967                         continue;
968
969                 /*
970                  * Not there, so add this function's Oid to the message
971                  */
972                 f_msg.m_functionid[f_msg.m_nentries++] = funcid;
973
974                 /*
975                  * If the message is full, send it out and reinitialize to empty
976                  */
977                 if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
978                 {
979                         len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
980                                 +f_msg.m_nentries * sizeof(Oid);
981
982                         pgstat_send(&f_msg, len);
983
984                         f_msg.m_nentries = 0;
985                 }
986         }
987
988         /*
989          * Send the rest
990          */
991         if (f_msg.m_nentries > 0)
992         {
993                 len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
994                         +f_msg.m_nentries * sizeof(Oid);
995
996                 pgstat_send(&f_msg, len);
997         }
998
999         hash_destroy(htab);
1000 }
1001
1002
1003 /* ----------
1004  * pgstat_collect_oids() -
1005  *
1006  *      Collect the OIDs of all objects listed in the specified system catalog
1007  *      into a temporary hash table.  Caller should hash_destroy the result
1008  *      when done with it.
1009  * ----------
1010  */
1011 static HTAB *
1012 pgstat_collect_oids(Oid catalogid)
1013 {
1014         HTAB       *htab;
1015         HASHCTL         hash_ctl;
1016         Relation        rel;
1017         HeapScanDesc scan;
1018         HeapTuple       tup;
1019
1020         memset(&hash_ctl, 0, sizeof(hash_ctl));
1021         hash_ctl.keysize = sizeof(Oid);
1022         hash_ctl.entrysize = sizeof(Oid);
1023         hash_ctl.hash = oid_hash;
1024         htab = hash_create("Temporary table of OIDs",
1025                                            PGSTAT_TAB_HASH_SIZE,
1026                                            &hash_ctl,
1027                                            HASH_ELEM | HASH_FUNCTION);
1028
1029         rel = heap_open(catalogid, AccessShareLock);
1030         scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
1031         while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
1032         {
1033                 Oid                     thisoid = HeapTupleGetOid(tup);
1034
1035                 CHECK_FOR_INTERRUPTS();
1036
1037                 (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
1038         }
1039         heap_endscan(scan);
1040         heap_close(rel, AccessShareLock);
1041
1042         return htab;
1043 }
1044
1045
1046 /* ----------
1047  * pgstat_drop_database() -
1048  *
1049  *      Tell the collector that we just dropped a database.
1050  *      (If the message gets lost, we will still clean the dead DB eventually
1051  *      via future invocations of pgstat_vacuum_stat().)
1052  * ----------
1053  */
1054 void
1055 pgstat_drop_database(Oid databaseid)
1056 {
1057         PgStat_MsgDropdb msg;
1058
1059         if (pgStatSock < 0)
1060                 return;
1061
1062         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
1063         msg.m_databaseid = databaseid;
1064         pgstat_send(&msg, sizeof(msg));
1065 }
1066
1067
/* ----------
 * pgstat_drop_relation() -
 *
 *	Send the collector a one-entry table purge message for the given
 *	relation of the current database, to tell it that we just dropped
 *	that relation.
 *	(If the message gets lost, we will still clean the dead entry eventually
 *	via future invocations of pgstat_vacuum_stat().)
 *
 *	Currently not used for lack of any good place to call it; we rely
 *	entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
 * ----------
 */
#ifdef NOT_USED
void
pgstat_drop_relation(Oid relid)
{
	PgStat_MsgTabpurge purgemsg;
	int			msglen;

	if (pgStatSock >= 0)
	{
		/* a purge message carrying exactly one table OID */
		purgemsg.m_tableid[0] = relid;
		purgemsg.m_nentries = 1;

		msglen = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);

		pgstat_setheader(&purgemsg.m_hdr, PGSTAT_MTYPE_TABPURGE);
		purgemsg.m_databaseid = MyDatabaseId;
		pgstat_send(&purgemsg, msglen);
	}
}
#endif   /* NOT_USED */
1099
1100
1101 /* ----------
1102  * pgstat_reset_counters() -
1103  *
1104  *      Tell the statistics collector to reset counters for our database.
1105  * ----------
1106  */
1107 void
1108 pgstat_reset_counters(void)
1109 {
1110         PgStat_MsgResetcounter msg;
1111
1112         if (pgStatSock < 0)
1113                 return;
1114
1115         if (!superuser())
1116                 ereport(ERROR,
1117                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
1118                                  errmsg("must be superuser to reset statistics counters")));
1119
1120         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
1121         msg.m_databaseid = MyDatabaseId;
1122         pgstat_send(&msg, sizeof(msg));
1123 }
1124
1125
1126 /* ----------
1127  * pgstat_report_autovac() -
1128  *
1129  *      Called from autovacuum.c to report startup of an autovacuum process.
1130  *      We are called before InitPostgres is done, so can't rely on MyDatabaseId;
1131  *      the db OID must be passed in, instead.
1132  * ----------
1133  */
1134 void
1135 pgstat_report_autovac(Oid dboid)
1136 {
1137         PgStat_MsgAutovacStart msg;
1138
1139         if (pgStatSock < 0)
1140                 return;
1141
1142         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
1143         msg.m_databaseid = dboid;
1144         msg.m_start_time = GetCurrentTimestamp();
1145
1146         pgstat_send(&msg, sizeof(msg));
1147 }
1148
1149
1150 /* ---------
1151  * pgstat_report_vacuum() -
1152  *
1153  *      Tell the collector about the table we just vacuumed.
1154  * ---------
1155  */
1156 void
1157 pgstat_report_vacuum(Oid tableoid, bool shared,
1158                                          bool analyze, PgStat_Counter tuples)
1159 {
1160         PgStat_MsgVacuum msg;
1161
1162         if (pgStatSock < 0 || !pgstat_track_counts)
1163                 return;
1164
1165         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
1166         msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
1167         msg.m_tableoid = tableoid;
1168         msg.m_analyze = analyze;
1169         msg.m_autovacuum = IsAutoVacuumWorkerProcess();         /* is this autovacuum? */
1170         msg.m_vacuumtime = GetCurrentTimestamp();
1171         msg.m_tuples = tuples;
1172         pgstat_send(&msg, sizeof(msg));
1173 }
1174
1175 /* --------
1176  * pgstat_report_analyze() -
1177  *
1178  *      Tell the collector about the table we just analyzed.
1179  * --------
1180  */
1181 void
1182 pgstat_report_analyze(Relation rel, PgStat_Counter livetuples,
1183                                           PgStat_Counter deadtuples)
1184 {
1185         PgStat_MsgAnalyze msg;
1186
1187         if (pgStatSock < 0 || !pgstat_track_counts)
1188                 return;
1189
1190         /*
1191          * Unlike VACUUM, ANALYZE might be running inside a transaction that
1192          * has already inserted and/or deleted rows in the target table.
1193          * ANALYZE will have counted such rows as live or dead respectively.
1194          * Because we will report our counts of such rows at transaction end,
1195          * we should subtract off these counts from what we send to the collector
1196          * now, else they'll be double-counted after commit.  (This approach also
1197          * ensures that the collector ends up with the right numbers if we abort
1198          * instead of committing.)
1199          */
1200         if (rel->pgstat_info != NULL)
1201         {
1202                 PgStat_TableXactStatus *trans;
1203
1204                 for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
1205                 {
1206                         livetuples -= trans->tuples_inserted - trans->tuples_deleted;
1207                         deadtuples -= trans->tuples_deleted;
1208                 }
1209                 /* count stuff inserted by already-aborted subxacts, too */
1210                 deadtuples -= rel->pgstat_info->t_counts.t_new_dead_tuples;
1211                 /* Since ANALYZE's counts are estimates, we could have underflowed */
1212                 livetuples = Max(livetuples, 0);
1213                 deadtuples = Max(deadtuples, 0);
1214         }
1215
1216         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
1217         msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
1218         msg.m_tableoid = RelationGetRelid(rel);
1219         msg.m_autovacuum = IsAutoVacuumWorkerProcess(); /* is this autovacuum? */
1220         msg.m_analyzetime = GetCurrentTimestamp();
1221         msg.m_live_tuples = livetuples;
1222         msg.m_dead_tuples = deadtuples;
1223         pgstat_send(&msg, sizeof(msg));
1224 }
1225
1226
1227 /* ----------
1228  * pgstat_ping() -
1229  *
1230  *      Send some junk data to the collector to increase traffic.
1231  * ----------
1232  */
1233 void
1234 pgstat_ping(void)
1235 {
1236         PgStat_MsgDummy msg;
1237
1238         if (pgStatSock < 0)
1239                 return;
1240
1241         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
1242         pgstat_send(&msg, sizeof(msg));
1243 }
1244
1245 /*
1246  * Initialize function call usage data.
1247  * Called by the executor before invoking a function.
1248  */
1249 void
1250 pgstat_init_function_usage(FunctionCallInfoData *fcinfo,
1251                                                    PgStat_FunctionCallUsage *fcu)
1252 {
1253         PgStat_BackendFunctionEntry *htabent;
1254         bool            found;
1255
1256         if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
1257         {
1258                 /* stats not wanted */
1259                 fcu->fs = NULL;
1260                 return;
1261         }
1262
1263         if (!pgStatFunctions)
1264         {
1265                 /* First time through - initialize function stat table */
1266                 HASHCTL         hash_ctl;
1267
1268                 memset(&hash_ctl, 0, sizeof(hash_ctl));
1269                 hash_ctl.keysize = sizeof(Oid);
1270                 hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
1271                 hash_ctl.hash = oid_hash;
1272                 pgStatFunctions = hash_create("Function stat entries",
1273                                                                           PGSTAT_FUNCTION_HASH_SIZE,
1274                                                                           &hash_ctl,
1275                                                                           HASH_ELEM | HASH_FUNCTION);
1276         }
1277
1278         /* Get the stats entry for this function, create if necessary */
1279         htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid,
1280                                                   HASH_ENTER, &found);
1281         if (!found)
1282                 MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts));
1283
1284         fcu->fs = &htabent->f_counts;
1285
1286         /* save stats for this function, later used to compensate for recursion */
1287         fcu->save_f_time = htabent->f_counts.f_time;
1288
1289         /* save current backend-wide total time */
1290         fcu->save_total = total_func_time;
1291
1292         /* get clock time as of function start */
1293         INSTR_TIME_SET_CURRENT(fcu->f_start);
1294 }
1295
1296 /*
1297  * Calculate function call usage and update stat counters.
1298  * Called by the executor after invoking a function.
1299  *
1300  * In the case of a set-returning function that runs in value-per-call mode,
1301  * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
1302  * calls for what the user considers a single call of the function.  The
1303  * finalize flag should be TRUE on the last call.
1304  */
1305 void
1306 pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize)
1307 {
1308         PgStat_FunctionCounts *fs = fcu->fs;
1309         instr_time      f_total;
1310         instr_time      f_others;
1311         instr_time      f_self;
1312
1313         /* stats not wanted? */
1314         if (fs == NULL)
1315                 return;
1316
1317         /* total elapsed time in this function call */
1318         INSTR_TIME_SET_CURRENT(f_total);
1319         INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
1320
1321         /* self usage: elapsed minus anything already charged to other calls */
1322         f_others = total_func_time;
1323         INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
1324         f_self = f_total;
1325         INSTR_TIME_SUBTRACT(f_self, f_others);
1326
1327         /* update backend-wide total time */
1328         INSTR_TIME_ADD(total_func_time, f_self);
1329
1330         /*
1331          * Compute the new total f_time as the total elapsed time added to the
1332          * pre-call value of f_time.  This is necessary to avoid double-counting
1333          * any time taken by recursive calls of myself.  (We do not need any
1334          * similar kluge for self time, since that already excludes any
1335          * recursive calls.)
1336          */
1337         INSTR_TIME_ADD(f_total, fcu->save_f_time);
1338
1339         /* update counters in function stats table */
1340         if (finalize)
1341                 fs->f_numcalls++;
1342         fs->f_time = f_total;
1343         INSTR_TIME_ADD(fs->f_time_self, f_self);
1344 }
1345
1346
1347 /* ----------
1348  * pgstat_initstats() -
1349  *
1350  *      Initialize a relcache entry to count access statistics.
1351  *      Called whenever a relation is opened.
1352  *
1353  *      We assume that a relcache entry's pgstat_info field is zeroed by
1354  *      relcache.c when the relcache entry is made; thereafter it is long-lived
1355  *      data.  We can avoid repeated searches of the TabStatus arrays when the
1356  *      same relation is touched repeatedly within a transaction.
1357  * ----------
1358  */
1359 void
1360 pgstat_initstats(Relation rel)
1361 {
1362         Oid                     rel_id = rel->rd_id;
1363         char            relkind = rel->rd_rel->relkind;
1364
1365         /* We only count stats for things that have storage */
1366         if (!(relkind == RELKIND_RELATION ||
1367                   relkind == RELKIND_INDEX ||
1368                   relkind == RELKIND_TOASTVALUE))
1369         {
1370                 rel->pgstat_info = NULL;
1371                 return;
1372         }
1373
1374         if (pgStatSock < 0 || !pgstat_track_counts)
1375         {
1376                 /* We're not counting at all */
1377                 rel->pgstat_info = NULL;
1378                 return;
1379         }
1380
1381         /*
1382          * If we already set up this relation in the current transaction, nothing
1383          * to do.
1384          */
1385         if (rel->pgstat_info != NULL &&
1386                 rel->pgstat_info->t_id == rel_id)
1387                 return;
1388
1389         /* Else find or make the PgStat_TableStatus entry, and update link */
1390         rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
1391 }
1392
1393 /*
1394  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
1395  */
1396 static PgStat_TableStatus *
1397 get_tabstat_entry(Oid rel_id, bool isshared)
1398 {
1399         PgStat_TableStatus *entry;
1400         TabStatusArray *tsa;
1401         TabStatusArray *prev_tsa;
1402         int                     i;
1403
1404         /*
1405          * Search the already-used tabstat slots for this relation.
1406          */
1407         prev_tsa = NULL;
1408         for (tsa = pgStatTabList; tsa != NULL; prev_tsa = tsa, tsa = tsa->tsa_next)
1409         {
1410                 for (i = 0; i < tsa->tsa_used; i++)
1411                 {
1412                         entry = &tsa->tsa_entries[i];
1413                         if (entry->t_id == rel_id)
1414                                 return entry;
1415                 }
1416
1417                 if (tsa->tsa_used < TABSTAT_QUANTUM)
1418                 {
1419                         /*
1420                          * It must not be present, but we found a free slot instead. Fine,
1421                          * let's use this one.  We assume the entry was already zeroed,
1422                          * either at creation or after last use.
1423                          */
1424                         entry = &tsa->tsa_entries[tsa->tsa_used++];
1425                         entry->t_id = rel_id;
1426                         entry->t_shared = isshared;
1427                         return entry;
1428                 }
1429         }
1430
1431         /*
1432          * We ran out of tabstat slots, so allocate more.  Be sure they're zeroed.
1433          */
1434         tsa = (TabStatusArray *) MemoryContextAllocZero(TopMemoryContext,
1435                                                                                                         sizeof(TabStatusArray));
1436         if (prev_tsa)
1437                 prev_tsa->tsa_next = tsa;
1438         else
1439                 pgStatTabList = tsa;
1440
1441         /*
1442          * Use the first entry of the new TabStatusArray.
1443          */
1444         entry = &tsa->tsa_entries[tsa->tsa_used++];
1445         entry->t_id = rel_id;
1446         entry->t_shared = isshared;
1447         return entry;
1448 }
1449
1450 /*
1451  * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
1452  */
1453 static PgStat_SubXactStatus *
1454 get_tabstat_stack_level(int nest_level)
1455 {
1456         PgStat_SubXactStatus *xact_state;
1457
1458         xact_state = pgStatXactStack;
1459         if (xact_state == NULL || xact_state->nest_level != nest_level)
1460         {
1461                 xact_state = (PgStat_SubXactStatus *)
1462                         MemoryContextAlloc(TopTransactionContext,
1463                                                            sizeof(PgStat_SubXactStatus));
1464                 xact_state->nest_level = nest_level;
1465                 xact_state->prev = pgStatXactStack;
1466                 xact_state->first = NULL;
1467                 pgStatXactStack = xact_state;
1468         }
1469         return xact_state;
1470 }
1471
1472 /*
1473  * add_tabstat_xact_level - add a new (sub)transaction state record
1474  */
1475 static void
1476 add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
1477 {
1478         PgStat_SubXactStatus *xact_state;
1479         PgStat_TableXactStatus *trans;
1480
1481         /*
1482          * If this is the first rel to be modified at the current nest level, we
1483          * first have to push a transaction stack entry.
1484          */
1485         xact_state = get_tabstat_stack_level(nest_level);
1486
1487         /* Now make a per-table stack entry */
1488         trans = (PgStat_TableXactStatus *)
1489                 MemoryContextAllocZero(TopTransactionContext,
1490                                                            sizeof(PgStat_TableXactStatus));
1491         trans->nest_level = nest_level;
1492         trans->upper = pgstat_info->trans;
1493         trans->parent = pgstat_info;
1494         trans->next = xact_state->first;
1495         xact_state->first = trans;
1496         pgstat_info->trans = trans;
1497 }
1498
1499 /*
1500  * pgstat_count_heap_insert - count a tuple insertion
1501  */
1502 void
1503 pgstat_count_heap_insert(Relation rel)
1504 {
1505         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1506
1507         if (pgstat_track_counts && pgstat_info != NULL)
1508         {
1509                 int                     nest_level = GetCurrentTransactionNestLevel();
1510
1511                 /* t_tuples_inserted is nontransactional, so just advance it */
1512                 pgstat_info->t_counts.t_tuples_inserted++;
1513
1514                 /* We have to log the transactional effect at the proper level */
1515                 if (pgstat_info->trans == NULL ||
1516                         pgstat_info->trans->nest_level != nest_level)
1517                         add_tabstat_xact_level(pgstat_info, nest_level);
1518
1519                 pgstat_info->trans->tuples_inserted++;
1520         }
1521 }
1522
1523 /*
1524  * pgstat_count_heap_update - count a tuple update
1525  */
1526 void
1527 pgstat_count_heap_update(Relation rel, bool hot)
1528 {
1529         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1530
1531         if (pgstat_track_counts && pgstat_info != NULL)
1532         {
1533                 int                     nest_level = GetCurrentTransactionNestLevel();
1534
1535                 /* t_tuples_updated is nontransactional, so just advance it */
1536                 pgstat_info->t_counts.t_tuples_updated++;
1537                 /* ditto for the hot_update counter */
1538                 if (hot)
1539                         pgstat_info->t_counts.t_tuples_hot_updated++;
1540
1541                 /* We have to log the transactional effect at the proper level */
1542                 if (pgstat_info->trans == NULL ||
1543                         pgstat_info->trans->nest_level != nest_level)
1544                         add_tabstat_xact_level(pgstat_info, nest_level);
1545
1546                 /* An UPDATE both inserts a new tuple and deletes the old */
1547                 pgstat_info->trans->tuples_inserted++;
1548                 pgstat_info->trans->tuples_deleted++;
1549         }
1550 }
1551
1552 /*
1553  * pgstat_count_heap_delete - count a tuple deletion
1554  */
1555 void
1556 pgstat_count_heap_delete(Relation rel)
1557 {
1558         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1559
1560         if (pgstat_track_counts && pgstat_info != NULL)
1561         {
1562                 int                     nest_level = GetCurrentTransactionNestLevel();
1563
1564                 /* t_tuples_deleted is nontransactional, so just advance it */
1565                 pgstat_info->t_counts.t_tuples_deleted++;
1566
1567                 /* We have to log the transactional effect at the proper level */
1568                 if (pgstat_info->trans == NULL ||
1569                         pgstat_info->trans->nest_level != nest_level)
1570                         add_tabstat_xact_level(pgstat_info, nest_level);
1571
1572                 pgstat_info->trans->tuples_deleted++;
1573         }
1574 }
1575
1576 /*
1577  * pgstat_update_heap_dead_tuples - update dead-tuples count
1578  *
1579  * The semantics of this are that we are reporting the nontransactional
1580  * recovery of "delta" dead tuples; so t_new_dead_tuples decreases
1581  * rather than increasing, and the change goes straight into the per-table
1582  * counter, not into transactional state.
1583  */
1584 void
1585 pgstat_update_heap_dead_tuples(Relation rel, int delta)
1586 {
1587         PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1588
1589         if (pgstat_track_counts && pgstat_info != NULL)
1590                 pgstat_info->t_counts.t_new_dead_tuples -= delta;
1591 }
1592
1593
1594 /* ----------
1595  * AtEOXact_PgStat
1596  *
1597  *      Called from access/transam/xact.c at top-level transaction commit/abort.
1598  * ----------
1599  */
1600 void
1601 AtEOXact_PgStat(bool isCommit)
1602 {
1603         PgStat_SubXactStatus *xact_state;
1604
1605         /*
1606          * Count transaction commit or abort.  (We use counters, not just bools,
1607          * in case the reporting message isn't sent right away.)
1608          */
1609         if (isCommit)
1610                 pgStatXactCommit++;
1611         else
1612                 pgStatXactRollback++;
1613
1614         /*
1615          * Transfer transactional insert/update counts into the base tabstat
1616          * entries.  We don't bother to free any of the transactional state, since
1617          * it's all in TopTransactionContext and will go away anyway.
1618          */
1619         xact_state = pgStatXactStack;
1620         if (xact_state != NULL)
1621         {
1622                 PgStat_TableXactStatus *trans;
1623
1624                 Assert(xact_state->nest_level == 1);
1625                 Assert(xact_state->prev == NULL);
1626                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1627                 {
1628                         PgStat_TableStatus *tabstat;
1629
1630                         Assert(trans->nest_level == 1);
1631                         Assert(trans->upper == NULL);
1632                         tabstat = trans->parent;
1633                         Assert(tabstat->trans == trans);
1634                         if (isCommit)
1635                         {
1636                                 tabstat->t_counts.t_new_live_tuples +=
1637                                         trans->tuples_inserted - trans->tuples_deleted;
1638                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_deleted;
1639                         }
1640                         else
1641                         {
1642                                 /* inserted tuples are dead, deleted tuples are unaffected */
1643                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
1644                         }
1645                         tabstat->trans = NULL;
1646                 }
1647         }
1648         pgStatXactStack = NULL;
1649
1650         /* Make sure any stats snapshot is thrown away */
1651         pgstat_clear_snapshot();
1652 }
1653
1654 /* ----------
1655  * AtEOSubXact_PgStat
1656  *
1657  *      Called from access/transam/xact.c at subtransaction commit/abort.
1658  * ----------
1659  */
1660 void
1661 AtEOSubXact_PgStat(bool isCommit, int nestDepth)
1662 {
1663         PgStat_SubXactStatus *xact_state;
1664
1665         /*
1666          * Transfer transactional insert/update counts into the next higher
1667          * subtransaction state.
1668          */
1669         xact_state = pgStatXactStack;
1670         if (xact_state != NULL &&
1671                 xact_state->nest_level >= nestDepth)
1672         {
1673                 PgStat_TableXactStatus *trans;
1674                 PgStat_TableXactStatus *next_trans;
1675
1676                 /* delink xact_state from stack immediately to simplify reuse case */
1677                 pgStatXactStack = xact_state->prev;
1678
1679                 for (trans = xact_state->first; trans != NULL; trans = next_trans)
1680                 {
1681                         PgStat_TableStatus *tabstat;
1682
1683                         next_trans = trans->next;
1684                         Assert(trans->nest_level == nestDepth);
1685                         tabstat = trans->parent;
1686                         Assert(tabstat->trans == trans);
1687                         if (isCommit)
1688                         {
1689                                 if (trans->upper && trans->upper->nest_level == nestDepth - 1)
1690                                 {
1691                                         trans->upper->tuples_inserted += trans->tuples_inserted;
1692                                         trans->upper->tuples_deleted += trans->tuples_deleted;
1693                                         tabstat->trans = trans->upper;
1694                                         pfree(trans);
1695                                 }
1696                                 else
1697                                 {
1698                                         /*
1699                                          * When there isn't an immediate parent state, we can just
1700                                          * reuse the record instead of going through a
1701                                          * palloc/pfree pushup (this works since it's all in
1702                                          * TopTransactionContext anyway).  We have to re-link it
1703                                          * into the parent level, though, and that might mean
1704                                          * pushing a new entry into the pgStatXactStack.
1705                                          */
1706                                         PgStat_SubXactStatus *upper_xact_state;
1707
1708                                         upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
1709                                         trans->next = upper_xact_state->first;
1710                                         upper_xact_state->first = trans;
1711                                         trans->nest_level = nestDepth - 1;
1712                                 }
1713                         }
1714                         else
1715                         {
1716                                 /*
1717                                  * On abort, inserted tuples are dead (and can be bounced out
1718                                  * to the top-level tabstat), deleted tuples are unaffected
1719                                  */
1720                                 tabstat->t_counts.t_new_dead_tuples += trans->tuples_inserted;
1721                                 tabstat->trans = trans->upper;
1722                                 pfree(trans);
1723                         }
1724                 }
1725                 pfree(xact_state);
1726         }
1727 }
1728
1729
1730 /*
1731  * AtPrepare_PgStat
1732  *              Save the transactional stats state at 2PC transaction prepare.
1733  *
1734  * In this phase we just generate 2PC records for all the pending
1735  * transaction-dependent stats work.
1736  */
1737 void
1738 AtPrepare_PgStat(void)
1739 {
1740         PgStat_SubXactStatus *xact_state;
1741
1742         xact_state = pgStatXactStack;
1743         if (xact_state != NULL)
1744         {
1745                 PgStat_TableXactStatus *trans;
1746
1747                 Assert(xact_state->nest_level == 1);
1748                 Assert(xact_state->prev == NULL);
1749                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1750                 {
1751                         PgStat_TableStatus *tabstat;
1752                         TwoPhasePgStatRecord record;
1753
1754                         Assert(trans->nest_level == 1);
1755                         Assert(trans->upper == NULL);
1756                         tabstat = trans->parent;
1757                         Assert(tabstat->trans == trans);
1758
1759                         record.tuples_inserted = trans->tuples_inserted;
1760                         record.tuples_deleted = trans->tuples_deleted;
1761                         record.t_id = tabstat->t_id;
1762                         record.t_shared = tabstat->t_shared;
1763
1764                         RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
1765                                                                    &record, sizeof(TwoPhasePgStatRecord));
1766                 }
1767         }
1768 }
1769
1770 /*
1771  * PostPrepare_PgStat
1772  *              Clean up after successful PREPARE.
1773  *
1774  * All we need do here is unlink the transaction stats state from the
1775  * nontransactional state.      The nontransactional action counts will be
1776  * reported to the stats collector immediately, while the effects on live
1777  * and dead tuple counts are preserved in the 2PC state file.
1778  *
1779  * Note: AtEOXact_PgStat is not called during PREPARE.
1780  */
1781 void
1782 PostPrepare_PgStat(void)
1783 {
1784         PgStat_SubXactStatus *xact_state;
1785
1786         /*
1787          * We don't bother to free any of the transactional state, since it's all
1788          * in TopTransactionContext and will go away anyway.
1789          */
1790         xact_state = pgStatXactStack;
1791         if (xact_state != NULL)
1792         {
1793                 PgStat_TableXactStatus *trans;
1794
1795                 for (trans = xact_state->first; trans != NULL; trans = trans->next)
1796                 {
1797                         PgStat_TableStatus *tabstat;
1798
1799                         tabstat = trans->parent;
1800                         tabstat->trans = NULL;
1801                 }
1802         }
1803         pgStatXactStack = NULL;
1804
1805         /* Make sure any stats snapshot is thrown away */
1806         pgstat_clear_snapshot();
1807 }
1808
1809 /*
1810  * 2PC processing routine for COMMIT PREPARED case.
1811  *
1812  * Load the saved counts into our local pgstats state.
1813  */
1814 void
1815 pgstat_twophase_postcommit(TransactionId xid, uint16 info,
1816                                                    void *recdata, uint32 len)
1817 {
1818         TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
1819         PgStat_TableStatus *pgstat_info;
1820
1821         /* Find or create a tabstat entry for the rel */
1822         pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
1823
1824         pgstat_info->t_counts.t_new_live_tuples +=
1825                 rec->tuples_inserted - rec->tuples_deleted;
1826         pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_deleted;
1827 }
1828
1829 /*
1830  * 2PC processing routine for ROLLBACK PREPARED case.
1831  *
1832  * Load the saved counts into our local pgstats state, but treat them
1833  * as aborted.
1834  */
1835 void
1836 pgstat_twophase_postabort(TransactionId xid, uint16 info,
1837                                                   void *recdata, uint32 len)
1838 {
1839         TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
1840         PgStat_TableStatus *pgstat_info;
1841
1842         /* Find or create a tabstat entry for the rel */
1843         pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
1844
1845         /* inserted tuples are dead, deleted tuples are no-ops */
1846         pgstat_info->t_counts.t_new_dead_tuples += rec->tuples_inserted;
1847 }
1848
1849
1850 /* ----------
1851  * pgstat_fetch_stat_dbentry() -
1852  *
1853  *      Support function for the SQL-callable pgstat* functions. Returns
1854  *      the collected statistics for one database or NULL. NULL doesn't mean
1855  *      that the database doesn't exist, it is just not yet known by the
1856  *      collector, so the caller is better off to report ZERO instead.
1857  * ----------
1858  */
1859 PgStat_StatDBEntry *
1860 pgstat_fetch_stat_dbentry(Oid dbid)
1861 {
1862         /*
1863          * If not done for this transaction, read the statistics collector stats
1864          * file into some hash tables.
1865          */
1866         backend_read_statsfile();
1867
1868         /*
1869          * Lookup the requested database; return NULL if not found
1870          */
1871         return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1872                                                                                           (void *) &dbid,
1873                                                                                           HASH_FIND, NULL);
1874 }
1875
1876
1877 /* ----------
1878  * pgstat_fetch_stat_tabentry() -
1879  *
1880  *      Support function for the SQL-callable pgstat* functions. Returns
1881  *      the collected statistics for one table or NULL. NULL doesn't mean
1882  *      that the table doesn't exist, it is just not yet known by the
1883  *      collector, so the caller is better off to report ZERO instead.
1884  * ----------
1885  */
1886 PgStat_StatTabEntry *
1887 pgstat_fetch_stat_tabentry(Oid relid)
1888 {
1889         Oid                     dbid;
1890         PgStat_StatDBEntry *dbentry;
1891         PgStat_StatTabEntry *tabentry;
1892
1893         /*
1894          * If not done for this transaction, read the statistics collector stats
1895          * file into some hash tables.
1896          */
1897         backend_read_statsfile();
1898
1899         /*
1900          * Lookup our database, then look in its table hash table.
1901          */
1902         dbid = MyDatabaseId;
1903         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1904                                                                                                  (void *) &dbid,
1905                                                                                                  HASH_FIND, NULL);
1906         if (dbentry != NULL && dbentry->tables != NULL)
1907         {
1908                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
1909                                                                                                            (void *) &relid,
1910                                                                                                            HASH_FIND, NULL);
1911                 if (tabentry)
1912                         return tabentry;
1913         }
1914
1915         /*
1916          * If we didn't find it, maybe it's a shared table.
1917          */
1918         dbid = InvalidOid;
1919         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1920                                                                                                  (void *) &dbid,
1921                                                                                                  HASH_FIND, NULL);
1922         if (dbentry != NULL && dbentry->tables != NULL)
1923         {
1924                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
1925                                                                                                            (void *) &relid,
1926                                                                                                            HASH_FIND, NULL);
1927                 if (tabentry)
1928                         return tabentry;
1929         }
1930
1931         return NULL;
1932 }
1933
1934
1935 /* ----------
1936  * pgstat_fetch_stat_funcentry() -
1937  *
1938  *      Support function for the SQL-callable pgstat* functions. Returns
1939  *      the collected statistics for one function or NULL.
1940  * ----------
1941  */
1942 PgStat_StatFuncEntry *
1943 pgstat_fetch_stat_funcentry(Oid func_id)
1944 {
1945         PgStat_StatDBEntry *dbentry;
1946         PgStat_StatFuncEntry *funcentry = NULL;
1947
1948         /* load the stats file if needed */
1949         backend_read_statsfile();
1950
1951         /* Lookup our database, then find the requested function.  */
1952         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1953         if (dbentry != NULL && dbentry->functions != NULL)
1954         {
1955                 funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
1956                                                                                                                  (void *) &func_id,
1957                                                                                                                  HASH_FIND, NULL);
1958         }
1959
1960         return funcentry;
1961 }
1962
1963
1964 /* ----------
1965  * pgstat_fetch_stat_beentry() -
1966  *
1967  *      Support function for the SQL-callable pgstat* functions. Returns
1968  *      our local copy of the current-activity entry for one backend.
1969  *
1970  *      NB: caller is responsible for a check if the user is permitted to see
1971  *      this info (especially the querystring).
1972  * ----------
1973  */
1974 PgBackendStatus *
1975 pgstat_fetch_stat_beentry(int beid)
1976 {
1977         pgstat_read_current_status();
1978
1979         if (beid < 1 || beid > localNumBackends)
1980                 return NULL;
1981
1982         return &localBackendStatusTable[beid - 1];
1983 }
1984
1985
1986 /* ----------
1987  * pgstat_fetch_stat_numbackends() -
1988  *
1989  *      Support function for the SQL-callable pgstat* functions. Returns
1990  *      the maximum current backend id.
1991  * ----------
1992  */
1993 int
1994 pgstat_fetch_stat_numbackends(void)
1995 {
1996         pgstat_read_current_status();
1997
1998         return localNumBackends;
1999 }
2000
2001 /*
2002  * ---------
2003  * pgstat_fetch_global() -
2004  *
2005  *      Support function for the SQL-callable pgstat* functions. Returns
2006  *      a pointer to the global statistics struct.
2007  * ---------
2008  */
2009 PgStat_GlobalStats *
2010 pgstat_fetch_global(void)
2011 {
2012         backend_read_statsfile();
2013
2014         return &globalStats;
2015 }
2016
2017
2018 /* ------------------------------------------------------------
2019  * Functions for management of the shared-memory PgBackendStatus array
2020  * ------------------------------------------------------------
2021  */
2022
2023 static PgBackendStatus *BackendStatusArray = NULL;
2024 static PgBackendStatus *MyBEEntry = NULL;
2025 static char                        *BackendActivityBuffer = NULL;
2026
2027
2028 /*
2029  * Report shared-memory space needed by CreateSharedBackendStatus.
2030  */
2031 Size
2032 BackendStatusShmemSize(void)
2033 {
2034         Size            size;
2035
2036         size = add_size(mul_size(sizeof(PgBackendStatus), MaxBackends),
2037                                         mul_size(pgstat_track_activity_query_size, MaxBackends));
2038         return size;
2039 }
2040
2041 /*
2042  * Initialize the shared status array and activity string buffer during
2043  * postmaster startup.
2044  */
2045 void
2046 CreateSharedBackendStatus(void)
2047 {
2048         Size            size;
2049         bool            found;
2050         int                     i;
2051         char       *buffer;
2052
2053         /* Create or attach to the shared array */
2054         size = mul_size(sizeof(PgBackendStatus), MaxBackends);
2055         BackendStatusArray = (PgBackendStatus *)
2056                 ShmemInitStruct("Backend Status Array", size, &found);
2057
2058         if (!found)
2059         {
2060                 /*
2061                  * We're the first - initialize.
2062                  */
2063                 MemSet(BackendStatusArray, 0, size);
2064         }
2065
2066         /* Create or attach to the shared activity buffer */
2067         size = mul_size(pgstat_track_activity_query_size, MaxBackends);
2068         BackendActivityBuffer = (char*)
2069                 ShmemInitStruct("Backend Activity Buffer", size, &found);
2070
2071         if (!found)
2072         {
2073                 MemSet(BackendActivityBuffer, 0, size);
2074
2075                 /* Initialize st_activity pointers. */
2076                 buffer = BackendActivityBuffer;
2077                 for (i = 0; i < MaxBackends; i++) {
2078                         BackendStatusArray[i].st_activity = buffer;
2079                         buffer += pgstat_track_activity_query_size;
2080                 }
2081         }
2082 }
2083
2084
2085 /* ----------
2086  * pgstat_initialize() -
2087  *
2088  *      Initialize pgstats state, and set up our on-proc-exit hook.
2089  *      Called from InitPostgres.  MyBackendId must be set,
2090  *      but we must not have started any transaction yet (since the
2091  *      exit hook must run after the last transaction exit).
2092  * ----------
2093  */
2094 void
2095 pgstat_initialize(void)
2096 {
2097         /* Initialize MyBEEntry */
2098         Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends);
2099         MyBEEntry = &BackendStatusArray[MyBackendId - 1];
2100
2101         /* Set up a process-exit hook to clean up */
2102         on_shmem_exit(pgstat_beshutdown_hook, 0);
2103 }
2104
2105 /* ----------
2106  * pgstat_bestart() -
2107  *
2108  *      Initialize this backend's entry in the PgBackendStatus array.
2109  *      Called from InitPostgres.  MyDatabaseId and session userid must be set
2110  *      (hence, this cannot be combined with pgstat_initialize).
2111  * ----------
2112  */
2113 void
2114 pgstat_bestart(void)
2115 {
2116         TimestampTz proc_start_timestamp;
2117         Oid                     userid;
2118         SockAddr        clientaddr;
2119         volatile PgBackendStatus *beentry;
2120
2121         /*
2122          * To minimize the time spent modifying the PgBackendStatus entry, fetch
2123          * all the needed data first.
2124          *
2125          * If we have a MyProcPort, use its session start time (for consistency,
2126          * and to save a kernel call).
2127          */
2128         if (MyProcPort)
2129                 proc_start_timestamp = MyProcPort->SessionStartTime;
2130         else
2131                 proc_start_timestamp = GetCurrentTimestamp();
2132         userid = GetSessionUserId();
2133
2134         /*
2135          * We may not have a MyProcPort (eg, if this is the autovacuum process).
2136          * If so, use all-zeroes client address, which is dealt with specially in
2137          * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
2138          */
2139         if (MyProcPort)
2140                 memcpy(&clientaddr, &MyProcPort->raddr, sizeof(clientaddr));
2141         else
2142                 MemSet(&clientaddr, 0, sizeof(clientaddr));
2143
2144         /*
2145          * Initialize my status entry, following the protocol of bumping
2146          * st_changecount before and after; and make sure it's even afterwards. We
2147          * use a volatile pointer here to ensure the compiler doesn't try to get
2148          * cute.
2149          */
2150         beentry = MyBEEntry;
2151         do
2152         {
2153                 beentry->st_changecount++;
2154         } while ((beentry->st_changecount & 1) == 0);
2155
2156         beentry->st_procpid = MyProcPid;
2157         beentry->st_proc_start_timestamp = proc_start_timestamp;
2158         beentry->st_activity_start_timestamp = 0;
2159         beentry->st_xact_start_timestamp = 0;
2160         beentry->st_databaseid = MyDatabaseId;
2161         beentry->st_userid = userid;
2162         beentry->st_clientaddr = clientaddr;
2163         beentry->st_waiting = false;
2164         beentry->st_activity[0] = '\0';
2165         /* Also make sure the last byte in the string area is always 0 */
2166         beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0';
2167
2168         beentry->st_changecount++;
2169         Assert((beentry->st_changecount & 1) == 0);
2170 }
2171
2172 /*
2173  * Shut down a single backend's statistics reporting at process exit.
2174  *
2175  * Flush any remaining statistics counts out to the collector.
2176  * Without this, operations triggered during backend exit (such as
2177  * temp table deletions) won't be counted.
2178  *
2179  * Lastly, clear out our entry in the PgBackendStatus array.
2180  */
2181 static void
2182 pgstat_beshutdown_hook(int code, Datum arg)
2183 {
2184         volatile PgBackendStatus *beentry = MyBEEntry;
2185
2186         pgstat_report_stat(true);
2187
2188         /*
2189          * Clear my status entry, following the protocol of bumping st_changecount
2190          * before and after.  We use a volatile pointer here to ensure the
2191          * compiler doesn't try to get cute.
2192          */
2193         beentry->st_changecount++;
2194
2195         beentry->st_procpid = 0;        /* mark invalid */
2196
2197         beentry->st_changecount++;
2198         Assert((beentry->st_changecount & 1) == 0);
2199 }
2200
2201
2202 /* ----------
2203  * pgstat_report_activity() -
2204  *
2205  *      Called from tcop/postgres.c to report what the backend is actually doing
2206  *      (usually "<IDLE>" or the start of the query to be executed).
2207  * ----------
2208  */
2209 void
2210 pgstat_report_activity(const char *cmd_str)
2211 {
2212         volatile PgBackendStatus *beentry = MyBEEntry;
2213         TimestampTz start_timestamp;
2214         int                     len;
2215
2216         TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
2217
2218         if (!pgstat_track_activities || !beentry)
2219                 return;
2220
2221         /*
2222          * To minimize the time spent modifying the entry, fetch all the needed
2223          * data first.
2224          */
2225         start_timestamp = GetCurrentStatementStartTimestamp();
2226
2227         len = strlen(cmd_str);
2228         len = pg_mbcliplen(cmd_str, len, pgstat_track_activity_query_size - 1);
2229
2230         /*
2231          * Update my status entry, following the protocol of bumping
2232          * st_changecount before and after.  We use a volatile pointer here to
2233          * ensure the compiler doesn't try to get cute.
2234          */
2235         beentry->st_changecount++;
2236
2237         beentry->st_activity_start_timestamp = start_timestamp;
2238         memcpy((char *) beentry->st_activity, cmd_str, len);
2239         beentry->st_activity[len] = '\0';
2240
2241         beentry->st_changecount++;
2242         Assert((beentry->st_changecount & 1) == 0);
2243 }
2244
2245 /*
2246  * Report current transaction start timestamp as the specified value.
2247  * Zero means there is no active transaction.
2248  */
2249 void
2250 pgstat_report_xact_timestamp(TimestampTz tstamp)
2251 {
2252         volatile PgBackendStatus *beentry = MyBEEntry;
2253
2254         if (!pgstat_track_activities || !beentry)
2255                 return;
2256
2257         /*
2258          * Update my status entry, following the protocol of bumping
2259          * st_changecount before and after.  We use a volatile pointer here to
2260          * ensure the compiler doesn't try to get cute.
2261          */
2262         beentry->st_changecount++;
2263         beentry->st_xact_start_timestamp = tstamp;
2264         beentry->st_changecount++;
2265         Assert((beentry->st_changecount & 1) == 0);
2266 }
2267
2268 /* ----------
2269  * pgstat_report_waiting() -
2270  *
2271  *      Called from lock manager to report beginning or end of a lock wait.
2272  *
2273  * NB: this *must* be able to survive being called before MyBEEntry has been
2274  * initialized.
2275  * ----------
2276  */
2277 void
2278 pgstat_report_waiting(bool waiting)
2279 {
2280         volatile PgBackendStatus *beentry = MyBEEntry;
2281
2282         if (!pgstat_track_activities || !beentry)
2283                 return;
2284
2285         /*
2286          * Since this is a single-byte field in a struct that only this process
2287          * may modify, there seems no need to bother with the st_changecount
2288          * protocol.  The update must appear atomic in any case.
2289          */
2290         beentry->st_waiting = waiting;
2291 }
2292
2293
2294 /* ----------
2295  * pgstat_read_current_status() -
2296  *
2297  *      Copy the current contents of the PgBackendStatus array to local memory,
2298  *      if not already done in this transaction.
2299  * ----------
2300  */
2301 static void
2302 pgstat_read_current_status(void)
2303 {
2304         volatile PgBackendStatus *beentry;
2305         PgBackendStatus *localtable;
2306         PgBackendStatus *localentry;
2307         char                    *localactivity;
2308         int                     i;
2309
2310         Assert(!pgStatRunningInCollector);
2311         if (localBackendStatusTable)
2312                 return;                                 /* already done */
2313
2314         pgstat_setup_memcxt();
2315
2316         localtable = (PgBackendStatus *)
2317                 MemoryContextAlloc(pgStatLocalContext,
2318                                                    sizeof(PgBackendStatus) * MaxBackends);
2319         localactivity = (char *)
2320                 MemoryContextAlloc(pgStatLocalContext,
2321                                                    pgstat_track_activity_query_size * MaxBackends);
2322         localNumBackends = 0;
2323
2324         beentry = BackendStatusArray;
2325         localentry = localtable;
2326         for (i = 1; i <= MaxBackends; i++)
2327         {
2328                 /*
2329                  * Follow the protocol of retrying if st_changecount changes while we
2330                  * copy the entry, or if it's odd.  (The check for odd is needed to
2331                  * cover the case where we are able to completely copy the entry while
2332                  * the source backend is between increment steps.)      We use a volatile
2333                  * pointer here to ensure the compiler doesn't try to get cute.
2334                  */
2335                 for (;;)
2336                 {
2337                         int                     save_changecount = beentry->st_changecount;
2338
2339                         localentry->st_procpid = beentry->st_procpid;
2340                         if (localentry->st_procpid > 0)
2341                         {
2342                                 memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
2343                                 /*
2344                                  * strcpy is safe even if the string is modified concurrently,
2345                                  * because there's always a \0 at the end of the buffer.
2346                                  */
2347                                 strcpy(localactivity, (char *) beentry->st_activity);
2348                                 localentry->st_activity = localactivity;
2349                         }
2350
2351                         if (save_changecount == beentry->st_changecount &&
2352                                 (save_changecount & 1) == 0)
2353                                 break;
2354
2355                         /* Make sure we can break out of loop if stuck... */
2356                         CHECK_FOR_INTERRUPTS();
2357                 }
2358
2359                 beentry++;
2360                 /* Only valid entries get included into the local array */
2361                 if (localentry->st_procpid > 0)
2362                 {
2363                         localentry++;
2364                         localactivity += pgstat_track_activity_query_size;
2365                         localNumBackends++;
2366                 }
2367         }
2368
2369         /* Set the pointer only after completion of a valid table */
2370         localBackendStatusTable = localtable;
2371 }
2372
2373
2374 /* ----------
2375  * pgstat_get_backend_current_activity() -
2376  *
2377  *      Return a string representing the current activity of the backend with
2378  *      the specified PID.  This looks directly at the BackendStatusArray,
2379  *      and so will provide current information regardless of the age of our
2380  *      transaction's snapshot of the status array.
2381  *
2382  *      It is the caller's responsibility to invoke this only for backends whose
2383  *      state is expected to remain stable while the result is in use.  The
2384  *      only current use is in deadlock reporting, where we can expect that
2385  *      the target backend is blocked on a lock.  (There are corner cases
2386  *      where the target's wait could get aborted while we are looking at it,
2387  *      but the very worst consequence is to return a pointer to a string
2388  *      that's been changed, so we won't worry too much.)
2389  *
2390  *      Note: return strings for special cases match pg_stat_get_backend_activity.
2391  * ----------
2392  */
2393 const char *
2394 pgstat_get_backend_current_activity(int pid, bool checkUser)
2395 {
2396         PgBackendStatus *beentry;
2397         int                     i;
2398
2399         beentry = BackendStatusArray;
2400         for (i = 1; i <= MaxBackends; i++)
2401         {
2402                 /*
2403                  * Although we expect the target backend's entry to be stable, that
2404                  * doesn't imply that anyone else's is.  To avoid identifying the
2405                  * wrong backend, while we check for a match to the desired PID we
2406                  * must follow the protocol of retrying if st_changecount changes
2407                  * while we examine the entry, or if it's odd.  (This might be
2408                  * unnecessary, since fetching or storing an int is almost certainly
2409                  * atomic, but let's play it safe.)  We use a volatile pointer here
2410                  * to ensure the compiler doesn't try to get cute.
2411                  */
2412                 volatile PgBackendStatus *vbeentry = beentry;
2413                 bool    found;
2414
2415                 for (;;)
2416                 {
2417                         int                     save_changecount = vbeentry->st_changecount;
2418
2419                         found = (vbeentry->st_procpid == pid);
2420
2421                         if (save_changecount == vbeentry->st_changecount &&
2422                                 (save_changecount & 1) == 0)
2423                                 break;
2424
2425                         /* Make sure we can break out of loop if stuck... */
2426                         CHECK_FOR_INTERRUPTS();
2427                 }
2428
2429                 if (found)
2430                 {
2431                         /* Now it is safe to use the non-volatile pointer */
2432                         if (checkUser && !superuser() && beentry->st_userid != GetUserId())
2433                                 return "<insufficient privilege>";
2434                         else if (*(beentry->st_activity) == '\0')
2435                                 return "<command string not enabled>";
2436                         else
2437                                 return beentry->st_activity;
2438                 }
2439
2440                 beentry++;
2441         }
2442
2443         /* If we get here, caller is in error ... */
2444         return "<backend information not available>";
2445 }
2446
2447
2448 /* ------------------------------------------------------------
2449  * Local support functions follow
2450  * ------------------------------------------------------------
2451  */
2452
2453
2454 /* ----------
2455  * pgstat_setheader() -
2456  *
2457  *              Set common header fields in a statistics message
2458  * ----------
2459  */
2460 static void
2461 pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype)
2462 {
2463         hdr->m_type = mtype;
2464 }
2465
2466
2467 /* ----------
2468  * pgstat_send() -
2469  *
2470  *              Send out one statistics message to the collector
2471  * ----------
2472  */
2473 static void
2474 pgstat_send(void *msg, int len)
2475 {
2476         int                     rc;
2477
2478         if (pgStatSock < 0)
2479                 return;
2480
2481         ((PgStat_MsgHdr *) msg)->m_size = len;
2482
2483         /* We'll retry after EINTR, but ignore all other failures */
2484         do
2485         {
2486                 rc = send(pgStatSock, msg, len, 0);
2487         } while (rc < 0 && errno == EINTR);
2488
2489 #ifdef USE_ASSERT_CHECKING
2490         /* In debug builds, log send failures ... */
2491         if (rc < 0)
2492                 elog(LOG, "could not send to statistics collector: %m");
2493 #endif
2494 }
2495
2496 /* ----------
2497  * pgstat_send_bgwriter() -
2498  *
2499  *              Send bgwriter statistics to the collector
2500  * ----------
2501  */
2502 void
2503 pgstat_send_bgwriter(void)
2504 {
2505         /* We assume this initializes to zeroes */
2506         static const PgStat_MsgBgWriter all_zeroes;
2507
2508         /*
2509          * This function can be called even if nothing at all has happened. In
2510          * this case, avoid sending a completely empty message to the stats
2511          * collector.
2512          */
2513         if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
2514                 return;
2515
2516         /*
2517          * Prepare and send the message
2518          */
2519         pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
2520         pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
2521
2522         /*
2523          * Clear out the statistics buffer, so it can be re-used.
2524          */
2525         MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
2526 }
2527
2528
2529 /* ----------
2530  * PgstatCollectorMain() -
2531  *
2532  *      Start up the statistics collector process.      This is the body of the
2533  *      postmaster child process.
2534  *
2535  *      The argc/argv parameters are valid only in EXEC_BACKEND case.
2536  * ----------
2537  */
2538 NON_EXEC_STATIC void
2539 PgstatCollectorMain(int argc, char *argv[])
2540 {
2541         struct itimerval write_timeout;
2542         bool            need_timer = false;
2543         int                     len;
2544         PgStat_Msg      msg;
2545
2546 #ifndef WIN32
2547 #ifdef HAVE_POLL
2548         struct pollfd input_fd;
2549 #else
2550         struct timeval sel_timeout;
2551         fd_set          rfds;
2552 #endif
2553 #endif
2554
2555         IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
2556
2557         MyProcPid = getpid();           /* reset MyProcPid */
2558
2559         MyStartTime = time(NULL);       /* record Start Time for logging */
2560
2561         /*
2562          * If possible, make this process a group leader, so that the postmaster
2563          * can signal any child processes too.  (pgstat probably never has any
2564          * child processes, but for consistency we make all postmaster child
2565          * processes do this.)
2566          */
2567 #ifdef HAVE_SETSID
2568         if (setsid() < 0)
2569                 elog(FATAL, "setsid() failed: %m");
2570 #endif
2571
2572         /*
2573          * Ignore all signals usually bound to some action in the postmaster,
2574          * except SIGQUIT and SIGALRM.
2575          */
2576         pqsignal(SIGHUP, pgstat_sighup_handler);
2577         pqsignal(SIGINT, SIG_IGN);
2578         pqsignal(SIGTERM, SIG_IGN);
2579         pqsignal(SIGQUIT, pgstat_exit);
2580         pqsignal(SIGALRM, force_statwrite);
2581         pqsignal(SIGPIPE, SIG_IGN);
2582         pqsignal(SIGUSR1, SIG_IGN);
2583         pqsignal(SIGUSR2, SIG_IGN);
2584         pqsignal(SIGCHLD, SIG_DFL);
2585         pqsignal(SIGTTIN, SIG_DFL);
2586         pqsignal(SIGTTOU, SIG_DFL);
2587         pqsignal(SIGCONT, SIG_DFL);
2588         pqsignal(SIGWINCH, SIG_DFL);
2589         PG_SETMASK(&UnBlockSig);
2590
2591         /*
2592          * Identify myself via ps
2593          */
2594         init_ps_display("stats collector process", "", "", "");
2595
2596         /*
2597          * Arrange to write the initial status file right away
2598          */
2599         need_statwrite = true;
2600
2601         /* Preset the delay between status file writes */
2602         MemSet(&write_timeout, 0, sizeof(struct itimerval));
2603         write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
2604         write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
2605
2606         /*
2607          * Read in an existing statistics stats file or initialize the stats to
2608          * zero.
2609          */
2610         pgStatRunningInCollector = true;
2611         pgStatDBHash = pgstat_read_statsfile(InvalidOid, true);
2612
2613         /*
2614          * Setup the descriptor set for select(2).      Since only one bit in the set
2615          * ever changes, we need not repeat FD_ZERO each time.
2616          */
2617 #if !defined(HAVE_POLL) && !defined(WIN32)
2618         FD_ZERO(&rfds);
2619 #endif
2620
2621         /*
2622          * Loop to process messages until we get SIGQUIT or detect ungraceful
2623          * death of our parent postmaster.
2624          *
2625          * For performance reasons, we don't want to do a PostmasterIsAlive() test
2626          * after every message; instead, do it at statwrite time and if
2627          * select()/poll() is interrupted by timeout.
2628          */
2629         for (;;)
2630         {
2631                 int                     got_data;
2632
2633                 /*
2634                  * Quit if we get SIGQUIT from the postmaster.
2635                  */
2636                 if (need_exit)
2637                         break;
2638
2639                 /*
2640                  * Reload configuration if we got SIGHUP from the postmaster.
2641                  * Also, signal a new write of the file, so we drop a new file as
2642                  * soon as possible of the directory for it changes.
2643                  */
2644                 if (got_SIGHUP)
2645                 {
2646                         ProcessConfigFile(PGC_SIGHUP);
2647                         got_SIGHUP = false;
2648                         need_statwrite = true;
2649                 }
2650
2651                 /*
2652                  * If time to write the stats file, do so.      Note that the alarm
2653                  * interrupt isn't re-enabled immediately, but only after we next
2654                  * receive a stats message; so no cycles are wasted when there is
2655                  * nothing going on.
2656                  */
2657                 if (need_statwrite)
2658                 {
2659                         /* Check for postmaster death; if so we'll write file below */
2660                         if (!PostmasterIsAlive(true))
2661                                 break;
2662
2663                         pgstat_write_statsfile(false);
2664                         need_statwrite = false;
2665                         need_timer = true;
2666                 }
2667
2668                 /*
2669                  * Wait for a message to arrive; but not for more than
2670                  * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
2671                  * shut down after an ungraceful postmaster termination; so it needn't
2672                  * be very fast.  However, on some systems SIGQUIT won't interrupt the
2673                  * poll/select call, so this also limits speed of response to SIGQUIT,
2674                  * which is more important.)
2675                  *
2676                  * We use poll(2) if available, otherwise select(2). Win32 has its own
2677                  * implementation.
2678                  */
2679 #ifndef WIN32
2680 #ifdef HAVE_POLL
2681                 input_fd.fd = pgStatSock;
2682                 input_fd.events = POLLIN | POLLERR;
2683                 input_fd.revents = 0;
2684
2685                 if (poll(&input_fd, 1, PGSTAT_SELECT_TIMEOUT * 1000) < 0)
2686                 {
2687                         if (errno == EINTR)
2688                                 continue;
2689                         ereport(ERROR,
2690                                         (errcode_for_socket_access(),
2691                                          errmsg("poll() failed in statistics collector: %m")));
2692                 }
2693
2694                 got_data = (input_fd.revents != 0);
2695 #else                                                   /* !HAVE_POLL */
2696
2697                 FD_SET(pgStatSock, &rfds);
2698
2699                 /*
2700                  * timeout struct is modified by select() on some operating systems,
2701                  * so re-fill it each time.
2702                  */
2703                 sel_timeout.tv_sec = PGSTAT_SELECT_TIMEOUT;
2704                 sel_timeout.tv_usec = 0;
2705
2706                 if (select(pgStatSock + 1, &rfds, NULL, NULL, &sel_timeout) < 0)
2707                 {
2708                         if (errno == EINTR)
2709                                 continue;
2710                         ereport(ERROR,
2711                                         (errcode_for_socket_access(),
2712                                          errmsg("select() failed in statistics collector: %m")));
2713                 }
2714
2715                 got_data = FD_ISSET(pgStatSock, &rfds);
2716 #endif   /* HAVE_POLL */
2717 #else                                                   /* WIN32 */
2718                 got_data = pgwin32_waitforsinglesocket(pgStatSock, FD_READ,
2719                                                                                            PGSTAT_SELECT_TIMEOUT * 1000);
2720 #endif
2721
2722                 /*
2723                  * If there is a message on the socket, read it and check for
2724                  * validity.
2725                  */
2726                 if (got_data)
2727                 {
2728                         len = recv(pgStatSock, (char *) &msg,
2729                                            sizeof(PgStat_Msg), 0);
2730                         if (len < 0)
2731                         {
2732                                 if (errno == EINTR)
2733                                         continue;
2734                                 ereport(ERROR,
2735                                                 (errcode_for_socket_access(),
2736                                                  errmsg("could not read statistics message: %m")));
2737                         }
2738
2739                         /*
2740                          * We ignore messages that are smaller than our common header
2741                          */
2742                         if (len < sizeof(PgStat_MsgHdr))
2743                                 continue;
2744
2745                         /*
2746                          * The received length must match the length in the header
2747                          */
2748                         if (msg.msg_hdr.m_size != len)
2749                                 continue;
2750
2751                         /*
2752                          * O.K. - we accept this message.  Process it.
2753                          */
2754                         switch (msg.msg_hdr.m_type)
2755                         {
2756                                 case PGSTAT_MTYPE_DUMMY:
2757                                         break;
2758
2759                                 case PGSTAT_MTYPE_TABSTAT:
2760                                         pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
2761                                         break;
2762
2763                                 case PGSTAT_MTYPE_TABPURGE:
2764                                         pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, len);
2765                                         break;
2766
2767                                 case PGSTAT_MTYPE_DROPDB:
2768                                         pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, len);
2769                                         break;
2770
2771                                 case PGSTAT_MTYPE_RESETCOUNTER:
2772                                         pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
2773                                                                                          len);
2774                                         break;
2775
2776                                 case PGSTAT_MTYPE_AUTOVAC_START:
2777                                         pgstat_recv_autovac((PgStat_MsgAutovacStart *) &msg, len);
2778                                         break;
2779
2780                                 case PGSTAT_MTYPE_VACUUM:
2781                                         pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
2782                                         break;
2783
2784                                 case PGSTAT_MTYPE_ANALYZE:
2785                                         pgstat_recv_analyze((PgStat_MsgAnalyze *) &msg, len);
2786                                         break;
2787
2788                                 case PGSTAT_MTYPE_BGWRITER:
2789                                         pgstat_recv_bgwriter((PgStat_MsgBgWriter *) &msg, len);
2790                                         break;
2791
2792                                 case PGSTAT_MTYPE_FUNCSTAT:
2793                                         pgstat_recv_funcstat((PgStat_MsgFuncstat *) &msg, len);
2794                                         break;
2795
2796                                 case PGSTAT_MTYPE_FUNCPURGE:
2797                                         pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
2798                                         break;
2799
2800                                 default:
2801                                         break;
2802                         }
2803
2804                         /*
2805                          * If this is the first message after we wrote the stats file the
2806                          * last time, enable the alarm interrupt to make it be written
2807                          * again later.
2808                          */
2809                         if (need_timer)
2810                         {
2811                                 if (setitimer(ITIMER_REAL, &write_timeout, NULL))
2812                                         ereport(ERROR,
2813                                         (errmsg("could not set statistics collector timer: %m")));
2814                                 need_timer = false;
2815                         }
2816                 }
2817                 else
2818                 {
2819                         /*
2820                          * We can only get here if the select/poll timeout elapsed. Check
2821                          * for postmaster death.
2822                          */
2823                         if (!PostmasterIsAlive(true))
2824                                 break;
2825                 }
2826         }                                                       /* end of message-processing loop */
2827
2828         /*
2829          * Save the final stats to reuse at next startup.
2830          */
2831         pgstat_write_statsfile(true);
2832
2833         exit(0);
2834 }
2835
2836
2837 /* SIGQUIT signal handler for collector process */
2838 static void
2839 pgstat_exit(SIGNAL_ARGS)
2840 {
2841         need_exit = true;
2842 }
2843
2844 /* SIGALRM signal handler for collector process */
2845 static void
2846 force_statwrite(SIGNAL_ARGS)
2847 {
2848         need_statwrite = true;
2849 }
2850
2851 /* SIGHUP handler for collector process */
2852 static void
2853 pgstat_sighup_handler(SIGNAL_ARGS)
2854 {
2855         got_SIGHUP = true;
2856 }
2857
2858
2859 /*
2860  * Lookup the hash table entry for the specified database. If no hash
2861  * table entry exists, initialize it, if the create parameter is true.
2862  * Else, return NULL.
2863  */
2864 static PgStat_StatDBEntry *
2865 pgstat_get_db_entry(Oid databaseid, bool create)
2866 {
2867         PgStat_StatDBEntry *result;
2868         bool            found;
2869         HASHACTION      action = (create ? HASH_ENTER : HASH_FIND);
2870
2871         /* Lookup or create the hash table entry for this database */
2872         result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2873                                                                                                 &databaseid,
2874                                                                                                 action, &found);
2875
2876         if (!create && !found)
2877                 return NULL;
2878
2879         /* If not found, initialize the new one. */
2880         if (!found)
2881         {
2882                 HASHCTL         hash_ctl;
2883
2884                 result->tables = NULL;
2885                 result->functions = NULL;
2886                 result->n_xact_commit = 0;
2887                 result->n_xact_rollback = 0;
2888                 result->n_blocks_fetched = 0;
2889                 result->n_blocks_hit = 0;
2890                 result->n_tuples_returned = 0;
2891                 result->n_tuples_fetched = 0;
2892                 result->n_tuples_inserted = 0;
2893                 result->n_tuples_updated = 0;
2894                 result->n_tuples_deleted = 0;
2895                 result->last_autovac_time = 0;
2896
2897                 memset(&hash_ctl, 0, sizeof(hash_ctl));
2898                 hash_ctl.keysize = sizeof(Oid);
2899                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
2900                 hash_ctl.hash = oid_hash;
2901                 result->tables = hash_create("Per-database table",
2902                                                                          PGSTAT_TAB_HASH_SIZE,
2903                                                                          &hash_ctl,
2904                                                                          HASH_ELEM | HASH_FUNCTION);
2905
2906                 hash_ctl.keysize = sizeof(Oid);
2907                 hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
2908                 hash_ctl.hash = oid_hash;
2909                 result->functions = hash_create("Per-database function",
2910                                                                                 PGSTAT_FUNCTION_HASH_SIZE,
2911                                                                                 &hash_ctl,
2912                                                                                 HASH_ELEM | HASH_FUNCTION);
2913         }
2914
2915         return result;
2916 }
2917
2918
2919 /* ----------
2920  * pgstat_write_statsfile() -
2921  *
2922  *      Tell the news.
2923  *      If writing to the permanent file (happens when the collector is
2924  *      shutting down only), remove the temporary file so that backends
2925  *      starting up under a new postmaster can't read the old data before
2926  *      the new collector is ready.
2927  * ----------
2928  */
2929 static void
2930 pgstat_write_statsfile(bool permanent)
2931 {
2932         HASH_SEQ_STATUS hstat;
2933         HASH_SEQ_STATUS tstat;
2934         HASH_SEQ_STATUS fstat;
2935         PgStat_StatDBEntry *dbentry;
2936         PgStat_StatTabEntry *tabentry;
2937         PgStat_StatFuncEntry *funcentry;
2938         FILE       *fpout;
2939         int32           format_id;
2940         const char *tmpfile = permanent?PGSTAT_STAT_PERMANENT_TMPFILE:pgstat_stat_tmpname;
2941         const char *statfile = permanent?PGSTAT_STAT_PERMANENT_FILENAME:pgstat_stat_filename;
2942
2943         /*
2944          * Open the statistics temp file to write out the current values.
2945          */
2946         fpout = fopen(tmpfile, PG_BINARY_W);
2947         if (fpout == NULL)
2948         {
2949                 ereport(LOG,
2950                                 (errcode_for_file_access(),
2951                                  errmsg("could not open temporary statistics file \"%s\": %m",
2952                                                 tmpfile)));
2953                 return;
2954         }
2955
2956         /*
2957          * Write the file header --- currently just a format ID.
2958          */
2959         format_id = PGSTAT_FILE_FORMAT_ID;
2960         fwrite(&format_id, sizeof(format_id), 1, fpout);
2961
2962         /*
2963          * Write global stats struct
2964          */
2965         fwrite(&globalStats, sizeof(globalStats), 1, fpout);
2966
2967         /*
2968          * Walk through the database table.
2969          */
2970         hash_seq_init(&hstat, pgStatDBHash);
2971         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
2972         {
2973                 /*
2974                  * Write out the DB entry including the number of live backends. We
2975                  * don't write the tables or functions pointers, since they're of
2976                  * no use to any other process.
2977                  */
2978                 fputc('D', fpout);
2979                 fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
2980
2981                 /*
2982                  * Walk through the database's access stats per table.
2983                  */
2984                 hash_seq_init(&tstat, dbentry->tables);
2985                 while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
2986                 {
2987                         fputc('T', fpout);
2988                         fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
2989                 }
2990
2991                 /*
2992                  * Walk through the database's function stats table.
2993                  */
2994                 hash_seq_init(&fstat, dbentry->functions);
2995                 while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
2996                 {
2997                         fputc('F', fpout);
2998                         fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
2999                 }
3000
3001                 /*
3002                  * Mark the end of this DB
3003                  */
3004                 fputc('d', fpout);
3005         }
3006
3007         /*
3008          * No more output to be done. Close the temp file and replace the old
3009          * pgstat.stat with it.  The ferror() check replaces testing for error
3010          * after each individual fputc or fwrite above.
3011          */
3012         fputc('E', fpout);
3013
3014         if (ferror(fpout))
3015         {
3016                 ereport(LOG,
3017                                 (errcode_for_file_access(),
3018                            errmsg("could not write temporary statistics file \"%s\": %m",
3019                                           tmpfile)));
3020                 fclose(fpout);
3021                 unlink(tmpfile);
3022         }
3023         else if (fclose(fpout) < 0)
3024         {
3025                 ereport(LOG,
3026                                 (errcode_for_file_access(),
3027                            errmsg("could not close temporary statistics file \"%s\": %m",
3028                                           tmpfile)));
3029                 unlink(tmpfile);
3030         }
3031         else if (rename(tmpfile, statfile) < 0)
3032         {
3033                 ereport(LOG,
3034                                 (errcode_for_file_access(),
3035                                  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
3036                                                 tmpfile, statfile)));
3037                 unlink(tmpfile);
3038         }
3039
3040         if (permanent)
3041                 unlink(pgstat_stat_filename);
3042 }
3043
3044
3045 /* ----------
3046  * pgstat_read_statsfile() -
3047  *
3048  *      Reads in an existing statistics collector file and initializes the
3049  *      databases' hash table (whose entries point to the tables' hash tables).
3050  * ----------
3051  */
3052 static HTAB *
3053 pgstat_read_statsfile(Oid onlydb, bool permanent)
3054 {
3055         PgStat_StatDBEntry *dbentry;
3056         PgStat_StatDBEntry dbbuf;
3057         PgStat_StatTabEntry *tabentry;
3058         PgStat_StatTabEntry tabbuf;
3059         PgStat_StatFuncEntry funcbuf;
3060         PgStat_StatFuncEntry *funcentry;
3061         HASHCTL         hash_ctl;
3062         HTAB       *dbhash;
3063         HTAB       *tabhash = NULL;
3064         HTAB       *funchash = NULL;
3065         FILE       *fpin;
3066         int32           format_id;
3067         bool            found;
3068         const char *statfile = permanent?PGSTAT_STAT_PERMANENT_FILENAME:pgstat_stat_filename;
3069
3070         /*
3071          * The tables will live in pgStatLocalContext.
3072          */
3073         pgstat_setup_memcxt();
3074
3075         /*
3076          * Create the DB hashtable
3077          */
3078         memset(&hash_ctl, 0, sizeof(hash_ctl));
3079         hash_ctl.keysize = sizeof(Oid);
3080         hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
3081         hash_ctl.hash = oid_hash;
3082         hash_ctl.hcxt = pgStatLocalContext;
3083         dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
3084                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3085
3086         /*
3087          * Clear out global statistics so they start from zero in case we can't
3088          * load an existing statsfile.
3089          */
3090         memset(&globalStats, 0, sizeof(globalStats));
3091
3092         /*
3093          * Try to open the status file. If it doesn't exist, the backends simply
3094          * return zero for anything and the collector simply starts from scratch
3095          * with empty counters.
3096          */
3097         if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
3098                 return dbhash;
3099
3100         /*
3101          * Verify it's of the expected format.
3102          */
3103         if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id)
3104                 || format_id != PGSTAT_FILE_FORMAT_ID)
3105         {
3106                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3107                                 (errmsg("corrupted pgstat.stat file")));
3108                 goto done;
3109         }
3110
3111         /*
3112          * Read global stats struct
3113          */
3114         if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
3115         {
3116                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3117                                 (errmsg("corrupted pgstat.stat file")));
3118                 goto done;
3119         }
3120
3121         /*
3122          * We found an existing collector stats file. Read it and put all the
3123          * hashtable entries into place.
3124          */
3125         for (;;)
3126         {
3127                 switch (fgetc(fpin))
3128                 {
3129                                 /*
3130                                  * 'D'  A PgStat_StatDBEntry struct describing a database
3131                                  * follows. Subsequently, zero to many 'T' and 'F' entries
3132                                  * will follow until a 'd' is encountered.
3133                                  */
3134                         case 'D':
3135                                 if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
3136                                                   fpin) != offsetof(PgStat_StatDBEntry, tables))
3137                                 {
3138                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3139                                                         (errmsg("corrupted pgstat.stat file")));
3140                                         goto done;
3141                                 }
3142
3143                                 /*
3144                                  * Add to the DB hash
3145                                  */
3146                                 dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
3147                                                                                                   (void *) &dbbuf.databaseid,
3148                                                                                                                          HASH_ENTER,
3149                                                                                                                          &found);
3150                                 if (found)
3151                                 {
3152                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3153                                                         (errmsg("corrupted pgstat.stat file")));
3154                                         goto done;
3155                                 }
3156
3157                                 memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
3158                                 dbentry->tables = NULL;
3159                                 dbentry->functions = NULL;
3160
3161                                 /*
3162                                  * Don't collect tables if not the requested DB (or the
3163                                  * shared-table info)
3164                                  */
3165                                 if (onlydb != InvalidOid)
3166                                 {
3167                                         if (dbbuf.databaseid != onlydb &&
3168                                                 dbbuf.databaseid != InvalidOid)
3169                                                 break;
3170                                 }
3171
3172                                 memset(&hash_ctl, 0, sizeof(hash_ctl));
3173                                 hash_ctl.keysize = sizeof(Oid);
3174                                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
3175                                 hash_ctl.hash = oid_hash;
3176                                 hash_ctl.hcxt = pgStatLocalContext;
3177                                 dbentry->tables = hash_create("Per-database table",
3178                                                                                           PGSTAT_TAB_HASH_SIZE,
3179                                                                                           &hash_ctl,
3180                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3181
3182                                 hash_ctl.keysize = sizeof(Oid);
3183                                 hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
3184                                 hash_ctl.hash = oid_hash;
3185                                 hash_ctl.hcxt = pgStatLocalContext;
3186                                 dbentry->functions = hash_create("Per-database function",
3187                                                                                                  PGSTAT_FUNCTION_HASH_SIZE,
3188                                                                                                  &hash_ctl,
3189                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
3190                                 /*
3191                                  * Arrange that following records add entries to this
3192                                  * database's hash tables.
3193                                  */
3194                                 tabhash = dbentry->tables;
3195                                 funchash = dbentry->functions;
3196                                 break;
3197
3198                                 /*
3199                                  * 'd'  End of this database.
3200                                  */
3201                         case 'd':
3202                                 tabhash = NULL;
3203                                 funchash = NULL;
3204                                 break;
3205
3206                                 /*
3207                                  * 'T'  A PgStat_StatTabEntry follows.
3208                                  */
3209                         case 'T':
3210                                 if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
3211                                                   fpin) != sizeof(PgStat_StatTabEntry))
3212                                 {
3213                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3214                                                         (errmsg("corrupted pgstat.stat file")));
3215                                         goto done;
3216                                 }
3217
3218                                 /*
3219                                  * Skip if table belongs to a not requested database.
3220                                  */
3221                                 if (tabhash == NULL)
3222                                         break;
3223
3224                                 tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
3225                                                                                                         (void *) &tabbuf.tableid,
3226                                                                                                                  HASH_ENTER, &found);
3227
3228                                 if (found)
3229                                 {
3230                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3231                                                         (errmsg("corrupted pgstat.stat file")));
3232                                         goto done;
3233                                 }
3234
3235                                 memcpy(tabentry, &tabbuf, sizeof(tabbuf));
3236                                 break;
3237
3238                                 /*
3239                                  * 'F'  A PgStat_StatFuncEntry follows.
3240                                  */
3241                         case 'F':
3242                                 if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
3243                                                   fpin) != sizeof(PgStat_StatFuncEntry))
3244                                 {
3245                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3246                                                         (errmsg("corrupted pgstat.stat file")));
3247                                         goto done;
3248                                 }
3249
3250                                 /*
3251                                  * Skip if function belongs to a not requested database.
3252                                  */
3253                                 if (funchash == NULL)
3254                                         break;
3255
3256                                 funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
3257                                                                                                         (void *) &funcbuf.functionid,
3258                                                                                                                  HASH_ENTER, &found);
3259
3260                                 if (found)
3261                                 {
3262                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
3263                                                         (errmsg("corrupted pgstat.stat file")));
3264                                         goto done;
3265                                 }
3266
3267                                 memcpy(funcentry, &funcbuf, sizeof(funcbuf));
3268                                 break;
3269
3270                                 /*
3271                                  * 'E'  The EOF marker of a complete stats file.
3272                                  */
3273                         case 'E':
3274                                 goto done;
3275
3276                         default:
3277                                 ereport(pgStatRunningInCollector ? LOG : WARNING,
3278                                                 (errmsg("corrupted pgstat.stat file")));
3279                                 goto done;
3280                 }
3281         }
3282
3283 done:
3284         FreeFile(fpin);
3285
3286         if (permanent)
3287                 unlink(PGSTAT_STAT_PERMANENT_FILENAME);
3288
3289         return dbhash;
3290 }
3291
3292 /*
3293  * If not already done, read the statistics collector stats file into
3294  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
3295  * is called (typically, at end of transaction).
3296  */
3297 static void
3298 backend_read_statsfile(void)
3299 {
3300         /* already read it? */
3301         if (pgStatDBHash)
3302                 return;
3303         Assert(!pgStatRunningInCollector);
3304
3305         /* Autovacuum launcher wants stats about all databases */
3306         if (IsAutoVacuumLauncherProcess())
3307                 pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
3308         else
3309                 pgStatDBHash = pgstat_read_statsfile(MyDatabaseId, false);
3310 }
3311
3312
3313 /* ----------
3314  * pgstat_setup_memcxt() -
3315  *
3316  *      Create pgStatLocalContext, if not already done.
3317  * ----------
3318  */
3319 static void
3320 pgstat_setup_memcxt(void)
3321 {
3322         if (!pgStatLocalContext)
3323                 pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
3324                                                                                                    "Statistics snapshot",
3325                                                                                                    ALLOCSET_SMALL_MINSIZE,
3326                                                                                                    ALLOCSET_SMALL_INITSIZE,
3327                                                                                                    ALLOCSET_SMALL_MAXSIZE);
3328 }
3329
3330
3331 /* ----------
3332  * pgstat_clear_snapshot() -
3333  *
3334  *      Discard any data collected in the current transaction.  Any subsequent
3335  *      request will cause new snapshots to be read.
3336  *
3337  *      This is also invoked during transaction commit or abort to discard
3338  *      the no-longer-wanted snapshot.
3339  * ----------
3340  */
3341 void
3342 pgstat_clear_snapshot(void)
3343 {
3344         /* Release memory, if any was allocated */
3345         if (pgStatLocalContext)
3346                 MemoryContextDelete(pgStatLocalContext);
3347
3348         /* Reset variables */
3349         pgStatLocalContext = NULL;
3350         pgStatDBHash = NULL;
3351         localBackendStatusTable = NULL;
3352         localNumBackends = 0;
3353 }
3354
3355
3356 /* ----------
3357  * pgstat_recv_tabstat() -
3358  *
3359  *      Count what the backend has done.
3360  * ----------
3361  */
3362 static void
3363 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
3364 {
3365         PgStat_TableEntry *tabmsg = &(msg->m_entry[0]);
3366         PgStat_StatDBEntry *dbentry;
3367         PgStat_StatTabEntry *tabentry;
3368         int                     i;
3369         bool            found;
3370
3371         dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
3372
3373         /*
3374          * Update database-wide stats.
3375          */
3376         dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
3377         dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
3378
3379         /*
3380          * Process all table entries in the message.
3381          */
3382         for (i = 0; i < msg->m_nentries; i++)
3383         {
3384                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
3385                                                                                                   (void *) &(tabmsg[i].t_id),
3386                                                                                                            HASH_ENTER, &found);
3387
3388                 if (!found)
3389                 {
3390                         /*
3391                          * If it's a new table entry, initialize counters to the values we
3392                          * just got.
3393                          */
3394                         tabentry->numscans = tabmsg[i].t_counts.t_numscans;
3395                         tabentry->tuples_returned = tabmsg[i].t_counts.t_tuples_returned;
3396                         tabentry->tuples_fetched = tabmsg[i].t_counts.t_tuples_fetched;
3397                         tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
3398                         tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
3399                         tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
3400                         tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated;
3401                         tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
3402                         tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
3403                         tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
3404                         tabentry->blocks_hit = tabmsg[i].t_counts.t_blocks_hit;
3405
3406                         tabentry->last_anl_tuples = 0;
3407                         tabentry->vacuum_timestamp = 0;
3408                         tabentry->autovac_vacuum_timestamp = 0;
3409                         tabentry->analyze_timestamp = 0;
3410                         tabentry->autovac_analyze_timestamp = 0;
3411                 }
3412                 else
3413                 {
3414                         /*
3415                          * Otherwise add the values to the existing entry.
3416                          */
3417                         tabentry->numscans += tabmsg[i].t_counts.t_numscans;
3418                         tabentry->tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
3419                         tabentry->tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
3420                         tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
3421                         tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
3422                         tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
3423                         tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated;
3424                         tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
3425                         tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
3426                         tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
3427                         tabentry->blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
3428                 }
3429
3430                 /* Clamp n_live_tuples in case of negative new_live_tuples */
3431                 tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
3432                 /* Likewise for n_dead_tuples */
3433                 tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
3434
3435                 /*
3436                  * Add per-table stats to the per-database entry, too.
3437                  */
3438                 dbentry->n_tuples_returned += tabmsg[i].t_counts.t_tuples_returned;
3439                 dbentry->n_tuples_fetched += tabmsg[i].t_counts.t_tuples_fetched;
3440                 dbentry->n_tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
3441                 dbentry->n_tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
3442                 dbentry->n_tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
3443                 dbentry->n_blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
3444                 dbentry->n_blocks_hit += tabmsg[i].t_counts.t_blocks_hit;
3445         }
3446 }
3447
3448
3449 /* ----------
3450  * pgstat_recv_tabpurge() -
3451  *
3452  *      Arrange for dead table removal.
3453  * ----------
3454  */
3455 static void
3456 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
3457 {
3458         PgStat_StatDBEntry *dbentry;
3459         int                     i;
3460
3461         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3462
3463         /*
3464          * No need to purge if we don't even know the database.
3465          */
3466         if (!dbentry || !dbentry->tables)
3467                 return;
3468
3469         /*
3470          * Process all table entries in the message.
3471          */
3472         for (i = 0; i < msg->m_nentries; i++)
3473         {
3474                 /* Remove from hashtable if present; we don't care if it's not. */
3475                 (void) hash_search(dbentry->tables,
3476                                                    (void *) &(msg->m_tableid[i]),
3477                                                    HASH_REMOVE, NULL);
3478         }
3479 }
3480
3481
3482 /* ----------
3483  * pgstat_recv_dropdb() -
3484  *
3485  *      Arrange for dead database removal
3486  * ----------
3487  */
3488 static void
3489 pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
3490 {
3491         PgStat_StatDBEntry *dbentry;
3492
3493         /*
3494          * Lookup the database in the hashtable.
3495          */
3496         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3497
3498         /*
3499          * If found, remove it.
3500          */
3501         if (dbentry)
3502         {
3503                 if (dbentry->tables != NULL)
3504                         hash_destroy(dbentry->tables);
3505                 if (dbentry->functions != NULL)
3506                         hash_destroy(dbentry->functions);
3507
3508                 if (hash_search(pgStatDBHash,
3509                                                 (void *) &(dbentry->databaseid),
3510                                                 HASH_REMOVE, NULL) == NULL)
3511                         ereport(ERROR,
3512                                         (errmsg("database hash table corrupted "
3513                                                         "during cleanup --- abort")));
3514         }
3515 }
3516
3517
3518 /* ----------
3519  * pgstat_recv_resetcounter() -
3520  *
3521  *      Reset the statistics for the specified database.
3522  * ----------
3523  */
3524 static void
3525 pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
3526 {
3527         HASHCTL         hash_ctl;
3528         PgStat_StatDBEntry *dbentry;
3529
3530         /*
3531          * Lookup the database in the hashtable.  Nothing to do if not there.
3532          */
3533         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3534
3535         if (!dbentry)
3536                 return;
3537
3538         /*
3539          * We simply throw away all the database's table entries by recreating a
3540          * new hash table for them.
3541          */
3542         if (dbentry->tables != NULL)
3543                 hash_destroy(dbentry->tables);
3544         if (dbentry->functions != NULL)
3545                 hash_destroy(dbentry->functions);
3546
3547         dbentry->tables = NULL;
3548         dbentry->functions = NULL;
3549         dbentry->n_xact_commit = 0;
3550         dbentry->n_xact_rollback = 0;
3551         dbentry->n_blocks_fetched = 0;
3552         dbentry->n_blocks_hit = 0;
3553
3554         memset(&hash_ctl, 0, sizeof(hash_ctl));
3555         hash_ctl.keysize = sizeof(Oid);
3556         hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
3557         hash_ctl.hash = oid_hash;
3558         dbentry->tables = hash_create("Per-database table",
3559                                                                   PGSTAT_TAB_HASH_SIZE,
3560                                                                   &hash_ctl,
3561                                                                   HASH_ELEM | HASH_FUNCTION);
3562
3563         hash_ctl.keysize = sizeof(Oid);
3564         hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
3565         hash_ctl.hash = oid_hash;
3566         dbentry->functions = hash_create("Per-database function",
3567                                                                          PGSTAT_FUNCTION_HASH_SIZE,
3568                                                                          &hash_ctl,
3569                                                                          HASH_ELEM | HASH_FUNCTION);
3570 }
3571
3572 /* ----------
3573  * pgstat_recv_autovac() -
3574  *
3575  *      Process an autovacuum signalling message.
3576  * ----------
3577  */
3578 static void
3579 pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
3580 {
3581         PgStat_StatDBEntry *dbentry;
3582
3583         /*
3584          * Lookup the database in the hashtable.  Don't create the entry if it
3585          * doesn't exist, because autovacuum may be processing a template
3586          * database.  If this isn't the case, the database is most likely to have
3587          * an entry already.  (If it doesn't, not much harm is done anyway --
3588          * it'll get created as soon as somebody actually uses the database.)
3589          */
3590         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3591         if (dbentry == NULL)
3592                 return;
3593
3594         /*
3595          * Store the last autovacuum time in the database entry.
3596          */
3597         dbentry->last_autovac_time = msg->m_start_time;
3598 }
3599
3600 /* ----------
3601  * pgstat_recv_vacuum() -
3602  *
3603  *      Process a VACUUM message.
3604  * ----------
3605  */
3606 static void
3607 pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
3608 {
3609         PgStat_StatDBEntry *dbentry;
3610         PgStat_StatTabEntry *tabentry;
3611
3612         /*
3613          * Don't create either the database or table entry if it doesn't already
3614          * exist.  This avoids bloating the stats with entries for stuff that is
3615          * only touched by vacuum and not by live operations.
3616          */
3617         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3618         if (dbentry == NULL)
3619                 return;
3620
3621         tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
3622                                                    HASH_FIND, NULL);
3623         if (tabentry == NULL)
3624                 return;
3625
3626         if (msg->m_autovacuum)
3627                 tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime;
3628         else
3629                 tabentry->vacuum_timestamp = msg->m_vacuumtime;
3630         tabentry->n_live_tuples = msg->m_tuples;
3631         /* Resetting dead_tuples to 0 is an approximation ... */
3632         tabentry->n_dead_tuples = 0;
3633         if (msg->m_analyze)
3634         {
3635                 tabentry->last_anl_tuples = msg->m_tuples;
3636                 if (msg->m_autovacuum)
3637                         tabentry->autovac_analyze_timestamp = msg->m_vacuumtime;
3638                 else
3639                         tabentry->analyze_timestamp = msg->m_vacuumtime;
3640         }
3641         else
3642         {
3643                 /* last_anl_tuples must never exceed n_live_tuples+n_dead_tuples */
3644                 tabentry->last_anl_tuples = Min(tabentry->last_anl_tuples,
3645                                                                                 msg->m_tuples);
3646         }
3647 }
3648
3649 /* ----------
3650  * pgstat_recv_analyze() -
3651  *
3652  *      Process an ANALYZE message.
3653  * ----------
3654  */
3655 static void
3656 pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
3657 {
3658         PgStat_StatDBEntry *dbentry;
3659         PgStat_StatTabEntry *tabentry;
3660
3661         /*
3662          * Don't create either the database or table entry if it doesn't already
3663          * exist.  This avoids bloating the stats with entries for stuff that is
3664          * only touched by analyze and not by live operations.
3665          */
3666         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3667         if (dbentry == NULL)
3668                 return;
3669
3670         tabentry = hash_search(dbentry->tables, &(msg->m_tableoid),
3671                                                    HASH_FIND, NULL);
3672         if (tabentry == NULL)
3673                 return;
3674
3675         if (msg->m_autovacuum)
3676                 tabentry->autovac_analyze_timestamp = msg->m_analyzetime;
3677         else
3678                 tabentry->analyze_timestamp = msg->m_analyzetime;
3679         tabentry->n_live_tuples = msg->m_live_tuples;
3680         tabentry->n_dead_tuples = msg->m_dead_tuples;
3681         tabentry->last_anl_tuples = msg->m_live_tuples + msg->m_dead_tuples;
3682 }
3683
3684
3685 /* ----------
3686  * pgstat_recv_bgwriter() -
3687  *
3688  *      Process a BGWRITER message.
3689  * ----------
3690  */
3691 static void
3692 pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
3693 {
3694         globalStats.timed_checkpoints += msg->m_timed_checkpoints;
3695         globalStats.requested_checkpoints += msg->m_requested_checkpoints;
3696         globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
3697         globalStats.buf_written_clean += msg->m_buf_written_clean;
3698         globalStats.maxwritten_clean += msg->m_maxwritten_clean;
3699         globalStats.buf_written_backend += msg->m_buf_written_backend;
3700         globalStats.buf_alloc += msg->m_buf_alloc;
3701 }
3702
3703 /* ----------
3704  * pgstat_recv_funcstat() -
3705  *
3706  *      Count what the backend has done.
3707  * ----------
3708  */
3709 static void
3710 pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
3711 {
3712         PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
3713         PgStat_StatDBEntry *dbentry;
3714         PgStat_StatFuncEntry *funcentry;
3715         int                     i;
3716         bool            found;
3717
3718         dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
3719
3720         /*
3721          * Process all function entries in the message.
3722          */
3723         for (i = 0; i < msg->m_nentries; i++, funcmsg++)
3724         {
3725                 funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
3726                                                                                                   (void *) &(funcmsg->f_id),
3727                                                                                                            HASH_ENTER, &found);
3728
3729                 if (!found)
3730                 {
3731                         /*
3732                          * If it's a new function entry, initialize counters to the values
3733                          * we just got.
3734                          */
3735                         funcentry->f_numcalls = funcmsg->f_numcalls;
3736                         funcentry->f_time = funcmsg->f_time;
3737                         funcentry->f_time_self = funcmsg->f_time_self;
3738                 }
3739                 else
3740                 {
3741                         /*
3742                          * Otherwise add the values to the existing entry.
3743                          */
3744                         funcentry->f_numcalls += funcmsg->f_numcalls;
3745                         funcentry->f_time += funcmsg->f_time;
3746                         funcentry->f_time_self += funcmsg->f_time_self;
3747                 }
3748         }
3749 }
3750
3751 /* ----------
3752  * pgstat_recv_funcpurge() -
3753  *
3754  *      Arrange for dead function removal.
3755  * ----------
3756  */
3757 static void
3758 pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
3759 {
3760         PgStat_StatDBEntry *dbentry;
3761         int                     i;
3762
3763         dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
3764
3765         /*
3766          * No need to purge if we don't even know the database.
3767          */
3768         if (!dbentry || !dbentry->functions)
3769                 return;
3770
3771         /*
3772          * Process all function entries in the message.
3773          */
3774         for (i = 0; i < msg->m_nentries; i++)
3775         {
3776                 /* Remove from hashtable if present; we don't care if it's not. */
3777                 (void) hash_search(dbentry->functions,
3778                                                    (void *) &(msg->m_functionid[i]),
3779                                                    HASH_REMOVE, NULL);
3780         }
3781 }