]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/pgstat.c
Fix a couple of items that should be declared Oid not int. Purely
[postgresql] / src / backend / postmaster / pgstat.c
1 /* ----------
2  * pgstat.c
3  *
4  *      All the statistics collector stuff hacked up in one big, ugly file.
5  *
6  *      TODO:   - Separate collector, postmaster and backend stuff
7  *                        into different files.
8  *
9  *                      - Add some automatic call for pgstat vacuuming.
10  *
11  *                      - Add a pgstat config column to pg_database, so this
12  *                        entire thing can be enabled/disabled on a per db basis.
13  *
14  *      Copyright (c) 2001-2005, PostgreSQL Global Development Group
15  *
16  *      $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.96 2005/06/25 23:58:57 tgl Exp $
17  * ----------
18  */
19 #include "postgres.h"
20
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/param.h>
24 #include <sys/time.h>
25 #include <sys/socket.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <arpa/inet.h>
29 #include <signal.h>
30 #include <time.h>
31
32 #include "pgstat.h"
33
34 #include "access/heapam.h"
35 #include "access/xact.h"
36 #include "catalog/pg_database.h"
37 #include "catalog/pg_shadow.h"
38 #include "libpq/libpq.h"
39 #include "libpq/pqsignal.h"
40 #include "mb/pg_wchar.h"
41 #include "miscadmin.h"
42 #include "postmaster/fork_process.h"
43 #include "postmaster/postmaster.h"
44 #include "storage/backendid.h"
45 #include "storage/fd.h"
46 #include "storage/ipc.h"
47 #include "storage/pg_shmem.h"
48 #include "storage/pmsignal.h"
49 #include "tcop/tcopprot.h"
50 #include "utils/hsearch.h"
51 #include "utils/memutils.h"
52 #include "utils/ps_status.h"
53 #include "utils/rel.h"
54 #include "utils/syscache.h"
55
56
/* ----------
 * Templates for the statistics file names.  The %s is replaced with
 * the installation's $PGDATA.  The temp-file template has an extra
 * %d slot (presumably a process ID -- confirm where the name is built).
 * ----------
 */
#define PGSTAT_STAT_FILENAME	"%s/global/pgstat.stat"
#define PGSTAT_STAT_TMPFILE		"%s/global/pgstat.tmp.%d"

/* ----------
 * Timer definitions.
 * ----------
 */

/* How often to write the status file; in milliseconds. */
#define PGSTAT_STAT_INTERVAL	500

/*
 * How long to keep destroyed objects known, to give delayed UDP packets
 * time to arrive; in milliseconds.
 */
#define PGSTAT_DESTROY_DELAY	10000

/* The destroy delay expressed as a number of status-file write intervals. */
#define PGSTAT_DESTROY_COUNT	(PGSTAT_DESTROY_DELAY / PGSTAT_STAT_INTERVAL)

/* How often to attempt to restart a failed statistics collector; in seconds. */
#define PGSTAT_RESTART_INTERVAL 60

/* ----------
 * Amount of space reserved in pgstat_recvbuffer(): room for 1024
 * PgStat_Msg structures.
 * ----------
 */
#define PGSTAT_RECVBUFFERSZ		((int) (1024 * sizeof(PgStat_Msg)))

/* ----------
 * The initial size hints for the hash tables used in the collector.
 * ----------
 */
#define PGSTAT_DB_HASH_SIZE		16
#define PGSTAT_BE_HASH_SIZE		512
#define PGSTAT_TAB_HASH_SIZE	512
96
97
/* ----------
 * GUC parameters
 *
 * pgstat_init() may override these: it forces startcollector on when
 * any of the three collection flags is set, and turns all four flags
 * marked (*) off when no working socket can be set up.
 * ----------
 */

/* (*) Start the collector at all?  Checked by pgstat_init/pgstat_start. */
bool		pgstat_collect_startcollector = true;

/* Remove the existing stats file when the postmaster starts? */
bool		pgstat_collect_resetonpmstart = true;

/* (*) Report activity strings?  Gates pgstat_report_activity(). */
bool		pgstat_collect_querystring = false;

/* (*) Collection flags; together with querystring they gate tabstat reports. */
bool		pgstat_collect_tuplelevel = false;
bool		pgstat_collect_blocklevel = false;
107
108 /* ----------
109  * Local data
110  * ----------
111  */
112 NON_EXEC_STATIC int pgStatSock = -1;
113 NON_EXEC_STATIC int pgStatPipe[2] = {-1,-1};
114 static struct sockaddr_storage pgStatAddr;
115 static pid_t pgStatCollectorPid = 0;
116
117 static time_t last_pgstat_start_time;
118
119 static long pgStatNumMessages = 0;
120
121 static bool pgStatRunningInCollector = FALSE;
122
123 static int      pgStatTabstatAlloc = 0;
124 static int      pgStatTabstatUsed = 0;
125 static PgStat_MsgTabstat **pgStatTabstatMessages = NULL;
126
127 #define TABSTAT_QUANTUM         4       /* we alloc this many at a time */
128
129 static int      pgStatXactCommit = 0;
130 static int      pgStatXactRollback = 0;
131
132 static TransactionId pgStatDBHashXact = InvalidTransactionId;
133 static HTAB *pgStatDBHash = NULL;
134 static HTAB *pgStatBeDead = NULL;
135 static PgStat_StatBeEntry *pgStatBeTable = NULL;
136 static int      pgStatNumBackends = 0;
137
138 static char pgStat_fname[MAXPGPATH];
139 static char pgStat_tmpfname[MAXPGPATH];
140
141
142 /* ----------
143  * Local function forward declarations
144  * ----------
145  */
146 #ifdef EXEC_BACKEND
147
148 typedef enum STATS_PROCESS_TYPE
149 {
150         STAT_PROC_BUFFER,
151         STAT_PROC_COLLECTOR
152 }       STATS_PROCESS_TYPE;
153
154 static pid_t pgstat_forkexec(STATS_PROCESS_TYPE procType);
155 static void pgstat_parseArgs(int argc, char *argv[]);
156 #endif
157
158 NON_EXEC_STATIC void PgstatBufferMain(int argc, char *argv[]);
159 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
160 static void pgstat_recvbuffer(void);
161 static void pgstat_exit(SIGNAL_ARGS);
162 static void pgstat_die(SIGNAL_ARGS);
163 static void pgstat_beshutdown_hook(int code, Datum arg);
164
165 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid);
166 static int      pgstat_add_backend(PgStat_MsgHdr *msg);
167 static void pgstat_sub_backend(int procpid);
168 static void pgstat_drop_database(Oid databaseid);
169 static void pgstat_write_statsfile(void);
170 static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
171                                           PgStat_StatBeEntry **betab,
172                                           int *numbackends);
173 static void backend_read_statsfile(void);
174
175 static void pgstat_setheader(PgStat_MsgHdr *hdr, int mtype);
176 static void pgstat_send(void *msg, int len);
177
178 static void pgstat_recv_bestart(PgStat_MsgBestart *msg, int len);
179 static void pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len);
180 static void pgstat_recv_activity(PgStat_MsgActivity *msg, int len);
181 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
182 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
183 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
184 static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
185
186
187 /* ------------------------------------------------------------
188  * Public functions called from postmaster follow
189  * ------------------------------------------------------------
190  */
191
#define TESTBYTEVAL ((char) 199)

/* ----------
 * pgstat_try_socket_address() -
 *
 *	Helper for pgstat_init(): try to set up the statistics socket on one
 *	candidate address.  The socket is created, bound to a kernel-assigned
 *	port, connected to its own address, and then verified by sending a
 *	one-byte test message to itself and reading it back.
 *
 *	Returns true with pgStatSock holding the working socket.  On any
 *	failure a LOG message is emitted, the socket (if created) is closed,
 *	pgStatSock is left at -1, and false is returned so that the caller
 *	can move on to the next address.
 * ----------
 */
static bool
pgstat_try_socket_address(const struct addrinfo *addr)
{
	ACCEPT_TYPE_ARG3 alen;
	fd_set		rset;
	struct timeval tv;
	char		test_byte;
	int			sel_res;

	/*
	 * Create the socket.
	 */
	if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) < 0)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not create socket for statistics collector: %m")));
		/* nothing to close yet */
		pgStatSock = -1;
		return false;
	}

	/*
	 * Bind it to a kernel assigned port on localhost and get the assigned
	 * port via getsockname().
	 */
	if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not bind socket for statistics collector: %m")));
		goto discard_socket;
	}

	alen = sizeof(pgStatAddr);
	if (getsockname(pgStatSock, (struct sockaddr *) & pgStatAddr, &alen) < 0)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not get address of socket for statistics collector: %m")));
		goto discard_socket;
	}

	/*
	 * Connect the socket to its own address.  This saves a few cycles by
	 * not having to respecify the target address on every send.  This also
	 * provides a kernel-level check that only packets from this same
	 * address will be received.
	 */
	if (connect(pgStatSock, (struct sockaddr *) & pgStatAddr, alen) < 0)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not connect socket for statistics collector: %m")));
		goto discard_socket;
	}

	/*
	 * Try to send and receive a one-byte test message on the socket.  This
	 * is to catch situations where the socket can be created but will not
	 * actually pass data (for instance, because kernel packet filtering
	 * rules prevent it).
	 */
	test_byte = TESTBYTEVAL;
	if (send(pgStatSock, &test_byte, 1, 0) != 1)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not send test message on socket for statistics collector: %m")));
		goto discard_socket;
	}

	/*
	 * There could possibly be a little delay before the message can be
	 * received.  We arbitrarily allow up to half a second before deciding
	 * it's broken.
	 */
	for (;;)					/* need a loop to handle EINTR */
	{
		FD_ZERO(&rset);
		FD_SET(pgStatSock, &rset);
		tv.tv_sec = 0;
		tv.tv_usec = 500000;
		sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
		if (sel_res >= 0 || errno != EINTR)
			break;
	}
	if (sel_res < 0)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("select() failed in statistics collector: %m")));
		goto discard_socket;
	}
	if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
	{
		/*
		 * This is the case we actually think is likely, so take pains to
		 * give a specific message for it.
		 *
		 * errno will not be set meaningfully here, so don't use it.
		 */
		ereport(LOG,
				(errcode(ERRCODE_CONNECTION_FAILURE),
				 errmsg("test message did not get through on socket for statistics collector")));
		goto discard_socket;
	}

	test_byte++;				/* just make sure variable is changed */

	if (recv(pgStatSock, &test_byte, 1, 0) != 1)
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not receive test message on socket for statistics collector: %m")));
		goto discard_socket;
	}

	if (test_byte != TESTBYTEVAL)		/* strictly paranoia ... */
	{
		ereport(LOG,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("incorrect test message transmission on socket for statistics collector")));
		goto discard_socket;
	}

	/* If we get here, we have a working socket */
	return true;

discard_socket:
	/* single cleanup path; always runs after the failure has been logged */
	closesocket(pgStatSock);
	pgStatSock = -1;
	return false;
}

/* ----------
 * pgstat_init() -
 *
 *	Called from postmaster at startup. Create the resources required
 *	by the statistics collector process.  If unable to do so, do not
 *	fail --- better to let the postmaster start with stats collection
 *	disabled.
 *
 *	On success pgStatSock is a connected, nonblocking UDP socket on
 *	localhost.  On failure pgStatSock is -1 and the collection GUC
 *	variables are switched off.
 * ----------
 */
void
pgstat_init(void)
{
	struct addrinfo *addrs = NULL,
			   *addr,
				hints;
	int			ret;

	/*
	 * Force start of collector daemon if something to collect
	 */
	if (pgstat_collect_querystring ||
		pgstat_collect_tuplelevel ||
		pgstat_collect_blocklevel)
		pgstat_collect_startcollector = true;

	/*
	 * Initialize the filename for the status reports.  (In the
	 * EXEC_BACKEND case, this only sets the value in the postmaster.  The
	 * collector subprocess will recompute the value for itself, and
	 * individual backends must do so also if they want to access the
	 * file.)
	 */
	snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);

	/*
	 * If we don't have to start a collector or should reset the collected
	 * statistics on postmaster start, simply remove the file.
	 */
	if (!pgstat_collect_startcollector || pgstat_collect_resetonpmstart)
		unlink(pgStat_fname);

	/*
	 * Nothing else required if collector will not get started
	 */
	if (!pgstat_collect_startcollector)
		return;

	/*
	 * Create the UDP socket for sending and receiving statistic messages.
	 *
	 * Zero the whole hints struct first, so that any members beyond the
	 * ones named here (some platforms have extras) are not left
	 * indeterminate when handed to the resolver.
	 */
	MemSet(&hints, 0, sizeof(hints));
	hints.ai_flags = AI_PASSIVE;
	hints.ai_family = PF_UNSPEC;
	hints.ai_socktype = SOCK_DGRAM;
	ret = getaddrinfo_all("localhost", NULL, &hints, &addrs);
	if (ret || !addrs)
	{
		ereport(LOG,
				(errmsg("could not resolve \"localhost\": %s",
						gai_strerror(ret))));
		goto startup_failed;
	}

	/*
	 * On some platforms, getaddrinfo_all() may return multiple addresses
	 * only one of which will actually work (eg, both IPv6 and IPv4
	 * addresses when kernel will reject IPv6).  Worse, the failure may
	 * occur at the bind() or perhaps even connect() stage.  So we must
	 * loop through the results till we find a working combination.  We
	 * will generate LOG messages, but no error, for bogus combinations.
	 */
	for (addr = addrs; addr; addr = addr->ai_next)
	{
#ifdef HAVE_UNIX_SOCKETS
		/* Ignore AF_UNIX sockets, if any are returned. */
		if (addr->ai_family == AF_UNIX)
			continue;
#endif

		if (pgstat_try_socket_address(addr))
			break;
	}

	/* Did we find a working address? */
	if (!addr || pgStatSock < 0)
		goto startup_failed;

	/*
	 * Set the socket to non-blocking IO.  This ensures that if the
	 * collector falls behind (despite the buffering process), statistics
	 * messages will be discarded; backends won't block waiting to send
	 * messages to the collector.
	 */
	if (!pg_set_noblock(pgStatSock))
	{
		ereport(LOG,
				(errcode_for_socket_access(),
				 errmsg("could not set statistics collector socket to nonblocking mode: %m")));
		goto startup_failed;
	}

	freeaddrinfo_all(hints.ai_family, addrs);

	return;

startup_failed:
	ereport(LOG,
			(errmsg("disabling statistics collector for lack of working socket")));

	if (addrs)
		freeaddrinfo_all(hints.ai_family, addrs);

	if (pgStatSock >= 0)
		closesocket(pgStatSock);
	pgStatSock = -1;

	/* Adjust GUC variables to suppress useless activity */
	pgstat_collect_startcollector = false;
	pgstat_collect_querystring = false;
	pgstat_collect_tuplelevel = false;
	pgstat_collect_blocklevel = false;
}
456
457
458 #ifdef EXEC_BACKEND
459
460 /*
461  * pgstat_forkexec() -
462  *
463  * Format up the arglist for, then fork and exec, statistics
464  * (buffer and collector) processes
465  */
466 static pid_t
467 pgstat_forkexec(STATS_PROCESS_TYPE procType)
468 {
469         char       *av[10];
470         int                     ac = 0,
471                                 bufc = 0,
472                                 i;
473         char            pgstatBuf[2][32];
474
475         av[ac++] = "postgres";
476
477         switch (procType)
478         {
479                 case STAT_PROC_BUFFER:
480                         av[ac++] = "-forkbuf";
481                         break;
482
483                 case STAT_PROC_COLLECTOR:
484                         av[ac++] = "-forkcol";
485                         break;
486
487                 default:
488                         Assert(false);
489         }
490
491         av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
492
493         /* postgres_exec_path is not passed by write_backend_variables */
494         av[ac++] = postgres_exec_path;
495
496         /* Add to the arg list */
497         Assert(bufc <= lengthof(pgstatBuf));
498         for (i = 0; i < bufc; i++)
499                 av[ac++] = pgstatBuf[i];
500
501         av[ac] = NULL;
502         Assert(ac < lengthof(av));
503
504         return postmaster_forkexec(ac, av);
505 }
506
507
508 /*
509  * pgstat_parseArgs() -
510  *
511  * Extract data from the arglist for exec'ed statistics
512  * (buffer and collector) processes
513  */
514 static void
515 pgstat_parseArgs(int argc, char *argv[])
516 {
517         Assert(argc == 4);
518
519         argc = 3;
520         StrNCpy(postgres_exec_path, argv[argc++], MAXPGPATH);
521 }
522 #endif   /* EXEC_BACKEND */
523
524
525 /* ----------
526  * pgstat_start() -
527  *
528  *      Called from postmaster at startup or after an existing collector
529  *      died.  Attempt to fire up a fresh statistics collector.
530  *
531  *      Returns PID of child process, or 0 if fail.
532  *
533  *      Note: if fail, we will be called again from the postmaster main loop.
534  * ----------
535  */
536 int
537 pgstat_start(void)
538 {
539         time_t          curtime;
540         pid_t           pgStatPid;
541
542         /*
543          * Do nothing if no collector needed
544          */
545         if (!pgstat_collect_startcollector)
546                 return 0;
547
548         /*
549          * Do nothing if too soon since last collector start.  This is a
550          * safety valve to protect against continuous respawn attempts if the
551          * collector is dying immediately at launch.  Note that since we will
552          * be re-called from the postmaster main loop, we will get another
553          * chance later.
554          */
555         curtime = time(NULL);
556         if ((unsigned int) (curtime - last_pgstat_start_time) <
557                 (unsigned int) PGSTAT_RESTART_INTERVAL)
558                 return 0;
559         last_pgstat_start_time = curtime;
560
561         /*
562          * Check that the socket is there, else pgstat_init failed.
563          */
564         if (pgStatSock < 0)
565         {
566                 ereport(LOG,
567                                 (errmsg("statistics collector startup skipped")));
568
569                 /*
570                  * We can only get here if someone tries to manually turn
571                  * pgstat_collect_startcollector on after it had been off.
572                  */
573                 pgstat_collect_startcollector = false;
574                 return 0;
575         }
576
577         /*
578          * Okay, fork off the collector.
579          */
580 #ifdef EXEC_BACKEND
581         switch ((pgStatPid = pgstat_forkexec(STAT_PROC_BUFFER)))
582 #else
583         switch ((pgStatPid = fork_process()))
584 #endif
585         {
586                 case -1:
587                         ereport(LOG,
588                                         (errmsg("could not fork statistics buffer: %m")));
589                         return 0;
590
591 #ifndef EXEC_BACKEND
592                 case 0:
593                         /* in postmaster child ... */
594                         /* Close the postmaster's sockets */
595                         ClosePostmasterPorts(false);
596
597                         /* Drop our connection to postmaster's shared memory, as well */
598                         PGSharedMemoryDetach();
599
600                         PgstatBufferMain(0, NULL);
601                         break;
602 #endif
603
604                 default:
605                         return (int) pgStatPid;
606         }
607
608         /* shouldn't get here */
609         return 0;
610 }
611
612
613 /* ----------
614  * pgstat_beterm() -
615  *
616  *      Called from postmaster to tell collector a backend terminated.
617  * ----------
618  */
619 void
620 pgstat_beterm(int pid)
621 {
622         PgStat_MsgBeterm msg;
623
624         if (pgStatSock < 0)
625                 return;
626
627         MemSet(&(msg.m_hdr), 0, sizeof(msg.m_hdr));
628         msg.m_hdr.m_type = PGSTAT_MTYPE_BETERM;
629         msg.m_hdr.m_procpid = pid;
630
631         pgstat_send(&msg, sizeof(msg));
632 }
633
634
635 /* ------------------------------------------------------------
636  * Public functions used by backends follow
637  *------------------------------------------------------------
638  */
639
640
641 /* ----------
642  * pgstat_bestart() -
643  *
644  *      Tell the collector that this new backend is soon ready to process
645  *      queries. Called from tcop/postgres.c before entering the mainloop.
646  * ----------
647  */
648 void
649 pgstat_bestart(void)
650 {
651         PgStat_MsgBestart msg;
652
653         if (pgStatSock < 0)
654                 return;
655
656         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_BESTART);
657         msg.m_databaseid = MyDatabaseId;
658         msg.m_userid = GetSessionUserId();
659         memcpy(&msg.m_clientaddr, &MyProcPort->raddr, sizeof(msg.m_clientaddr));
660         pgstat_send(&msg, sizeof(msg));
661
662         /*
663          * Set up a process-exit hook to ensure we flush the last batch of
664          * statistics to the collector.
665          */
666         on_proc_exit(pgstat_beshutdown_hook, 0);
667 }
668
669 /*
670  * Flush any remaining statistics counts out to the collector at process
671  * exit.   Without this, operations triggered during backend exit (such as
672  * temp table deletions) won't be counted.  This is an on_proc_exit hook,
673  * not on_shmem_exit, so that everything interesting must have happened
674  * already.
675  */
676 static void
677 pgstat_beshutdown_hook(int code, Datum arg)
678 {
679         pgstat_report_tabstat();
680 }
681
682
683 /* ----------
684  * pgstat_report_activity() -
685  *
686  *      Called from tcop/postgres.c to tell the collector what the backend
687  *      is actually doing (usually "<IDLE>" or the start of the query to
688  *      be executed).
689  * ----------
690  */
691 void
692 pgstat_report_activity(const char *what)
693 {
694         PgStat_MsgActivity msg;
695         int                     len;
696
697         if (!pgstat_collect_querystring || pgStatSock < 0)
698                 return;
699
700         len = strlen(what);
701         len = pg_mbcliplen((const unsigned char *) what, len,
702                                            PGSTAT_ACTIVITY_SIZE - 1);
703
704         memcpy(msg.m_what, what, len);
705         msg.m_what[len] = '\0';
706         len += offsetof(PgStat_MsgActivity, m_what) +1;
707
708         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ACTIVITY);
709         pgstat_send(&msg, len);
710 }
711
712
713 /* ----------
714  * pgstat_report_tabstat() -
715  *
716  *      Called from tcop/postgres.c to send the so far collected
717  *      per table access statistics to the collector.
718  * ----------
719  */
720 void
721 pgstat_report_tabstat(void)
722 {
723         int                     i;
724
725         if (pgStatSock < 0 ||
726                 !(pgstat_collect_querystring ||
727                   pgstat_collect_tuplelevel ||
728                   pgstat_collect_blocklevel))
729         {
730                 /* Not reporting stats, so just flush whatever we have */
731                 pgStatTabstatUsed = 0;
732                 return;
733         }
734
735         /*
736          * For each message buffer used during the last query set the header
737          * fields and send it out.
738          */
739         for (i = 0; i < pgStatTabstatUsed; i++)
740         {
741                 PgStat_MsgTabstat *tsmsg = pgStatTabstatMessages[i];
742                 int                     n;
743                 int                     len;
744
745                 n = tsmsg->m_nentries;
746                 len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
747                         n * sizeof(PgStat_TableEntry);
748
749                 tsmsg->m_xact_commit = pgStatXactCommit;
750                 tsmsg->m_xact_rollback = pgStatXactRollback;
751                 pgStatXactCommit = 0;
752                 pgStatXactRollback = 0;
753
754                 pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
755                 tsmsg->m_databaseid = MyDatabaseId;
756                 pgstat_send(tsmsg, len);
757         }
758
759         pgStatTabstatUsed = 0;
760 }
761
762
763 /* ----------
764  * pgstat_vacuum_tabstat() -
765  *
766  *      Will tell the collector about objects he can get rid of.
767  * ----------
768  */
769 int
770 pgstat_vacuum_tabstat(void)
771 {
772         Relation        dbrel;
773         HeapScanDesc dbscan;
774         HeapTuple       dbtup;
775         Oid                *dbidlist;
776         int                     dbidalloc;
777         int                     dbidused;
778         HASH_SEQ_STATUS hstat;
779         PgStat_StatDBEntry *dbentry;
780         PgStat_StatTabEntry *tabentry;
781         HeapTuple       reltup;
782         int                     nobjects = 0;
783         PgStat_MsgTabpurge msg;
784         int                     len;
785         int                     i;
786
787         if (pgStatSock < 0)
788                 return 0;
789
790         /*
791          * If not done for this transaction, read the statistics collector
792          * stats file into some hash tables.
793          */
794         backend_read_statsfile();
795
796         /*
797          * Lookup our own database entry
798          */
799         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
800                                                                                                  (void *) &MyDatabaseId,
801                                                                                                  HASH_FIND, NULL);
802         if (dbentry == NULL)
803                 return -1;
804
805         if (dbentry->tables == NULL)
806                 return 0;
807
808         /*
809          * Initialize our messages table counter to zero
810          */
811         msg.m_nentries = 0;
812
813         /*
814          * Check for all tables if they still exist.
815          */
816         hash_seq_init(&hstat, dbentry->tables);
817         while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
818         {
819                 /*
820                  * Check if this relation is still alive by looking up its
821                  * pg_class tuple in the system catalog cache.
822                  */
823                 reltup = SearchSysCache(RELOID,
824                                                                 ObjectIdGetDatum(tabentry->tableid),
825                                                                 0, 0, 0);
826                 if (HeapTupleIsValid(reltup))
827                 {
828                         ReleaseSysCache(reltup);
829                         continue;
830                 }
831
832                 /*
833                  * Add this table's Oid to the message
834                  */
835                 msg.m_tableid[msg.m_nentries++] = tabentry->tableid;
836                 nobjects++;
837
838                 /*
839                  * If the message is full, send it out and reinitialize to zero
840                  */
841                 if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
842                 {
843                         len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
844                                 +msg.m_nentries * sizeof(Oid);
845
846                         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
847                         pgstat_send(&msg, len);
848
849                         msg.m_nentries = 0;
850                 }
851         }
852
853         /*
854          * Send the rest
855          */
856         if (msg.m_nentries > 0)
857         {
858                 len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
859                         +msg.m_nentries * sizeof(Oid);
860
861                 pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
862                 msg.m_databaseid = MyDatabaseId;
863                 pgstat_send(&msg, len);
864         }
865
866         /*
867          * Read pg_database and remember the Oid's of all existing databases
868          */
869         dbidalloc = 256;
870         dbidused = 0;
871         dbidlist = (Oid *) palloc(sizeof(Oid) * dbidalloc);
872
873         dbrel = heap_open(DatabaseRelationId, AccessShareLock);
874         dbscan = heap_beginscan(dbrel, SnapshotNow, 0, NULL);
875         while ((dbtup = heap_getnext(dbscan, ForwardScanDirection)) != NULL)
876         {
877                 if (dbidused >= dbidalloc)
878                 {
879                         dbidalloc *= 2;
880                         dbidlist = (Oid *) repalloc((char *) dbidlist,
881                                                                                 sizeof(Oid) * dbidalloc);
882                 }
883                 dbidlist[dbidused++] = HeapTupleGetOid(dbtup);
884         }
885         heap_endscan(dbscan);
886         heap_close(dbrel, AccessShareLock);
887
888         /*
889          * Search the database hash table for dead databases and tell the
890          * collector to drop them as well.
891          */
892         hash_seq_init(&hstat, pgStatDBHash);
893         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
894         {
895                 Oid                     dbid = dbentry->databaseid;
896
897                 for (i = 0; i < dbidused; i++)
898                 {
899                         if (dbidlist[i] == dbid)
900                         {
901                                 dbid = InvalidOid;
902                                 break;
903                         }
904                 }
905
906                 if (dbid != InvalidOid)
907                 {
908                         nobjects++;
909                         pgstat_drop_database(dbid);
910                 }
911         }
912
913         /*
914          * Free the dbid list.
915          */
916         pfree(dbidlist);
917
918         /*
919          * Tell the caller how many removeable objects we found
920          */
921         return nobjects;
922 }
923
924
925 /* ----------
926  * pgstat_drop_database() -
927  *
928  *      Tell the collector that we just dropped a database.
929  *      This is the only message that shouldn't get lost in space. Otherwise
930  *      the collector will keep the statistics for the dead DB until his
931  *      stats file got removed while the postmaster is down.
932  * ----------
933  */
934 static void
935 pgstat_drop_database(Oid databaseid)
936 {
937         PgStat_MsgDropdb msg;
938
939         if (pgStatSock < 0)
940                 return;
941
942         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB);
943         msg.m_databaseid = databaseid;
944         pgstat_send(&msg, sizeof(msg));
945 }
946
947
948 /* ----------
949  * pgstat_reset_counters() -
950  *
951  *      Tell the statistics collector to reset counters for our database.
952  * ----------
953  */
954 void
955 pgstat_reset_counters(void)
956 {
957         PgStat_MsgResetcounter msg;
958
959         if (pgStatSock < 0)
960                 return;
961
962         if (!superuser())
963                 ereport(ERROR,
964                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
965                           errmsg("must be superuser to reset statistics counters")));
966
967         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER);
968         msg.m_databaseid = MyDatabaseId;
969         pgstat_send(&msg, sizeof(msg));
970 }
971
972
973 /* ----------
974  * pgstat_ping() -
975  *
976  *      Send some junk data to the collector to increase traffic.
977  * ----------
978  */
979 void
980 pgstat_ping(void)
981 {
982         PgStat_MsgDummy msg;
983
984         if (pgStatSock < 0)
985                 return;
986
987         pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY);
988         pgstat_send(&msg, sizeof(msg));
989 }
990
991 /*
992  * Create or enlarge the pgStatTabstatMessages array
993  */
994 static void
995 more_tabstat_space(void)
996 {
997         PgStat_MsgTabstat *newMessages;
998         PgStat_MsgTabstat **msgArray;
999         int                     newAlloc = pgStatTabstatAlloc + TABSTAT_QUANTUM;
1000         int                     i;
1001
1002         /* Create (another) quantum of message buffers */
1003         newMessages = (PgStat_MsgTabstat *)
1004                 MemoryContextAllocZero(TopMemoryContext,
1005                                                            sizeof(PgStat_MsgTabstat) * TABSTAT_QUANTUM);
1006
1007         /* Create or enlarge the pointer array */
1008         if (pgStatTabstatMessages == NULL)
1009                 msgArray = (PgStat_MsgTabstat **)
1010                         MemoryContextAlloc(TopMemoryContext,
1011                                                            sizeof(PgStat_MsgTabstat *) * newAlloc);
1012         else
1013                 msgArray = (PgStat_MsgTabstat **)
1014                         repalloc(pgStatTabstatMessages,
1015                                          sizeof(PgStat_MsgTabstat *) * newAlloc);
1016
1017         for (i = 0; i < TABSTAT_QUANTUM; i++)
1018                 msgArray[pgStatTabstatAlloc + i] = newMessages++;
1019         pgStatTabstatMessages = msgArray;
1020         pgStatTabstatAlloc = newAlloc;
1021
1022         Assert(pgStatTabstatUsed < pgStatTabstatAlloc);
1023 }
1024
1025 /* ----------
1026  * pgstat_initstats() -
1027  *
1028  *      Called from various places usually dealing with initialization
1029  *      of Relation or Scan structures. The data placed into these
1030  *      structures from here tell where later to count for buffer reads,
1031  *      scans and tuples fetched.
1032  * ----------
1033  */
1034 void
1035 pgstat_initstats(PgStat_Info *stats, Relation rel)
1036 {
1037         Oid                     rel_id = rel->rd_id;
1038         PgStat_TableEntry *useent;
1039         PgStat_MsgTabstat *tsmsg;
1040         int                     mb;
1041         int                     i;
1042
1043         /*
1044          * Initialize data not to count at all.
1045          */
1046         stats->tabentry = NULL;
1047         stats->no_stats = FALSE;
1048         stats->heap_scan_counted = FALSE;
1049         stats->index_scan_counted = FALSE;
1050
1051         if (pgStatSock < 0 ||
1052                 !(pgstat_collect_tuplelevel ||
1053                   pgstat_collect_blocklevel))
1054         {
1055                 stats->no_stats = TRUE;
1056                 return;
1057         }
1058
1059         /*
1060          * Search the already-used message slots for this relation.
1061          */
1062         for (mb = 0; mb < pgStatTabstatUsed; mb++)
1063         {
1064                 tsmsg = pgStatTabstatMessages[mb];
1065
1066                 for (i = tsmsg->m_nentries; --i >= 0;)
1067                 {
1068                         if (tsmsg->m_entry[i].t_id == rel_id)
1069                         {
1070                                 stats->tabentry = (void *) &(tsmsg->m_entry[i]);
1071                                 return;
1072                         }
1073                 }
1074
1075                 if (tsmsg->m_nentries >= PGSTAT_NUM_TABENTRIES)
1076                         continue;
1077
1078                 /*
1079                  * Not found, but found a message buffer with an empty slot
1080                  * instead. Fine, let's use this one.
1081                  */
1082                 i = tsmsg->m_nentries++;
1083                 useent = &tsmsg->m_entry[i];
1084                 MemSet(useent, 0, sizeof(PgStat_TableEntry));
1085                 useent->t_id = rel_id;
1086                 stats->tabentry = (void *) useent;
1087                 return;
1088         }
1089
1090         /*
1091          * If we ran out of message buffers, we just allocate more.
1092          */
1093         if (pgStatTabstatUsed >= pgStatTabstatAlloc)
1094                 more_tabstat_space();
1095
1096         /*
1097          * Use the first entry of the next message buffer.
1098          */
1099         mb = pgStatTabstatUsed++;
1100         tsmsg = pgStatTabstatMessages[mb];
1101         tsmsg->m_nentries = 1;
1102         useent = &tsmsg->m_entry[0];
1103         MemSet(useent, 0, sizeof(PgStat_TableEntry));
1104         useent->t_id = rel_id;
1105         stats->tabentry = (void *) useent;
1106 }
1107
1108
1109 /* ----------
1110  * pgstat_count_xact_commit() -
1111  *
1112  *      Called from access/transam/xact.c to count transaction commits.
1113  * ----------
1114  */
1115 void
1116 pgstat_count_xact_commit(void)
1117 {
1118         if (!(pgstat_collect_querystring ||
1119                   pgstat_collect_tuplelevel ||
1120                   pgstat_collect_blocklevel))
1121                 return;
1122
1123         pgStatXactCommit++;
1124
1125         /*
1126          * If there was no relation activity yet, just make one existing
1127          * message buffer used without slots, causing the next report to tell
1128          * new xact-counters.
1129          */
1130         if (pgStatTabstatAlloc == 0)
1131                 more_tabstat_space();
1132
1133         if (pgStatTabstatUsed == 0)
1134         {
1135                 pgStatTabstatUsed++;
1136                 pgStatTabstatMessages[0]->m_nentries = 0;
1137         }
1138 }
1139
1140
1141 /* ----------
1142  * pgstat_count_xact_rollback() -
1143  *
1144  *      Called from access/transam/xact.c to count transaction rollbacks.
1145  * ----------
1146  */
1147 void
1148 pgstat_count_xact_rollback(void)
1149 {
1150         if (!(pgstat_collect_querystring ||
1151                   pgstat_collect_tuplelevel ||
1152                   pgstat_collect_blocklevel))
1153                 return;
1154
1155         pgStatXactRollback++;
1156
1157         /*
1158          * If there was no relation activity yet, just make one existing
1159          * message buffer used without slots, causing the next report to tell
1160          * new xact-counters.
1161          */
1162         if (pgStatTabstatAlloc == 0)
1163                 more_tabstat_space();
1164
1165         if (pgStatTabstatUsed == 0)
1166         {
1167                 pgStatTabstatUsed++;
1168                 pgStatTabstatMessages[0]->m_nentries = 0;
1169         }
1170 }
1171
1172
1173 /* ----------
1174  * pgstat_fetch_stat_dbentry() -
1175  *
1176  *      Support function for the SQL-callable pgstat* functions. Returns
1177  *      the collected statistics for one database or NULL. NULL doesn't mean
1178  *      that the database doesn't exist, it is just not yet known by the
1179  *      collector, so the caller is better off to report ZERO instead.
1180  * ----------
1181  */
1182 PgStat_StatDBEntry *
1183 pgstat_fetch_stat_dbentry(Oid dbid)
1184 {
1185         /*
1186          * If not done for this transaction, read the statistics collector
1187          * stats file into some hash tables.
1188          */
1189         backend_read_statsfile();
1190
1191         /*
1192          * Lookup the requested database; return NULL if not found
1193          */
1194         return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1195                                                                                           (void *) &dbid,
1196                                                                                           HASH_FIND, NULL);
1197 }
1198
1199
1200 /* ----------
1201  * pgstat_fetch_stat_tabentry() -
1202  *
1203  *      Support function for the SQL-callable pgstat* functions. Returns
1204  *      the collected statistics for one table or NULL. NULL doesn't mean
1205  *      that the table doesn't exist, it is just not yet known by the
1206  *      collector, so the caller is better off to report ZERO instead.
1207  * ----------
1208  */
1209 PgStat_StatTabEntry *
1210 pgstat_fetch_stat_tabentry(Oid relid)
1211 {
1212         PgStat_StatDBEntry *dbentry;
1213         PgStat_StatTabEntry *tabentry;
1214
1215         /*
1216          * If not done for this transaction, read the statistics collector
1217          * stats file into some hash tables.
1218          */
1219         backend_read_statsfile();
1220
1221         /*
1222          * Lookup our database.
1223          */
1224         dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1225                                                                                                  (void *) &MyDatabaseId,
1226                                                                                                  HASH_FIND, NULL);
1227         if (dbentry == NULL)
1228                 return NULL;
1229
1230         /*
1231          * Now inside the DB's table hash table lookup the requested one.
1232          */
1233         if (dbentry->tables == NULL)
1234                 return NULL;
1235         tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
1236                                                                                                    (void *) &relid,
1237                                                                                                    HASH_FIND, NULL);
1238         if (tabentry == NULL)
1239                 return NULL;
1240
1241         return tabentry;
1242 }
1243
1244
1245 /* ----------
1246  * pgstat_fetch_stat_beentry() -
1247  *
1248  *      Support function for the SQL-callable pgstat* functions. Returns
1249  *      the actual activity slot of one active backend. The caller is
1250  *      responsible for a check if the actual user is permitted to see
1251  *      that info (especially the querystring).
1252  * ----------
1253  */
1254 PgStat_StatBeEntry *
1255 pgstat_fetch_stat_beentry(int beid)
1256 {
1257         backend_read_statsfile();
1258
1259         if (beid < 1 || beid > pgStatNumBackends)
1260                 return NULL;
1261
1262         return &pgStatBeTable[beid - 1];
1263 }
1264
1265
1266 /* ----------
1267  * pgstat_fetch_stat_numbackends() -
1268  *
1269  *      Support function for the SQL-callable pgstat* functions. Returns
1270  *      the maximum current backend id.
1271  * ----------
1272  */
1273 int
1274 pgstat_fetch_stat_numbackends(void)
1275 {
1276         backend_read_statsfile();
1277
1278         return pgStatNumBackends;
1279 }
1280
1281
1282
1283 /* ------------------------------------------------------------
1284  * Local support functions follow
1285  * ------------------------------------------------------------
1286  */
1287
1288
1289 /* ----------
1290  * pgstat_setheader() -
1291  *
1292  *              Set common header fields in a statistics message
1293  * ----------
1294  */
1295 static void
1296 pgstat_setheader(PgStat_MsgHdr *hdr, int mtype)
1297 {
1298         hdr->m_type = mtype;
1299         hdr->m_backendid = MyBackendId;
1300         hdr->m_procpid = MyProcPid;
1301 }
1302
1303
1304 /* ----------
1305  * pgstat_send() -
1306  *
1307  *              Send out one statistics message to the collector
1308  * ----------
1309  */
1310 static void
1311 pgstat_send(void *msg, int len)
1312 {
1313         if (pgStatSock < 0)
1314                 return;
1315
1316         ((PgStat_MsgHdr *) msg)->m_size = len;
1317
1318         send(pgStatSock, msg, len, 0);
1319         /* We deliberately ignore any error from send() */
1320 }
1321
1322
1323 /* ----------
1324  * PgstatBufferMain() -
1325  *
1326  *      Start up the statistics buffer process.  This is the body of the
1327  *      postmaster child process.
1328  *
1329  *      The argc/argv parameters are valid only in EXEC_BACKEND case.
1330  * ----------
1331  */
1332 NON_EXEC_STATIC void
1333 PgstatBufferMain(int argc, char *argv[])
1334 {
1335         IsUnderPostmaster = true;       /* we are a postmaster subprocess now */
1336
1337         MyProcPid = getpid();           /* reset MyProcPid */
1338
1339         /* Lose the postmaster's on-exit routines */
1340         on_exit_reset();
1341
1342         /*
1343          * Ignore all signals usually bound to some action in the postmaster,
1344          * except for SIGCHLD and SIGQUIT --- see pgstat_recvbuffer.
1345          */
1346         pqsignal(SIGHUP, SIG_IGN);
1347         pqsignal(SIGINT, SIG_IGN);
1348         pqsignal(SIGTERM, SIG_IGN);
1349         pqsignal(SIGQUIT, pgstat_exit);
1350         pqsignal(SIGALRM, SIG_IGN);
1351         pqsignal(SIGPIPE, SIG_IGN);
1352         pqsignal(SIGUSR1, SIG_IGN);
1353         pqsignal(SIGUSR2, SIG_IGN);
1354         pqsignal(SIGCHLD, pgstat_die);
1355         pqsignal(SIGTTIN, SIG_DFL);
1356         pqsignal(SIGTTOU, SIG_DFL);
1357         pqsignal(SIGCONT, SIG_DFL);
1358         pqsignal(SIGWINCH, SIG_DFL);
1359         /* unblock will happen in pgstat_recvbuffer */
1360
1361 #ifdef EXEC_BACKEND
1362         pgstat_parseArgs(argc, argv);
1363 #endif
1364
1365         /*
1366          * Start a buffering process to read from the socket, so we have a
1367          * little more time to process incoming messages.
1368          *
1369          * NOTE: the process structure is: postmaster is parent of buffer process
1370          * is parent of collector process.      This way, the buffer can detect
1371          * collector failure via SIGCHLD, whereas otherwise it wouldn't notice
1372          * collector failure until it tried to write on the pipe.  That would
1373          * mean that after the postmaster started a new collector, we'd have
1374          * two buffer processes competing to read from the UDP socket --- not
1375          * good.
1376          */
1377         if (pgpipe(pgStatPipe) < 0)
1378                 ereport(ERROR,
1379                                 (errcode_for_socket_access(),
1380                          errmsg("could not create pipe for statistics buffer: %m")));
1381
1382         /* child becomes collector process */
1383 #ifdef EXEC_BACKEND
1384         pgStatCollectorPid = pgstat_forkexec(STAT_PROC_COLLECTOR);
1385 #else
1386         pgStatCollectorPid = fork();
1387 #endif
1388         switch (pgStatCollectorPid)
1389         {
1390                 case -1:
1391                         ereport(ERROR,
1392                                         (errmsg("could not fork statistics collector: %m")));
1393
1394 #ifndef EXEC_BACKEND
1395                 case 0:
1396                         /* child becomes collector process */
1397                         PgstatCollectorMain(0, NULL);
1398                         break;
1399 #endif
1400
1401                 default:
1402                         /* parent becomes buffer process */
1403                         closesocket(pgStatPipe[0]);
1404                         pgstat_recvbuffer();
1405         }
1406         exit(0);
1407 }
1408
1409
1410 /* ----------
1411  * PgstatCollectorMain() -
1412  *
1413  *      Start up the statistics collector itself.  This is the body of the
1414  *      postmaster grandchild process.
1415  *
1416  *      The argc/argv parameters are valid only in EXEC_BACKEND case.
1417  * ----------
1418  */
1419 NON_EXEC_STATIC void
1420 PgstatCollectorMain(int argc, char *argv[])
1421 {
1422         PgStat_Msg      msg;
1423         fd_set          rfds;
1424         int                     readPipe;
1425         int                     nready;
1426         int                     len = 0;
1427         struct timeval timeout;
1428         struct timeval next_statwrite;
1429         bool            need_statwrite;
1430         HASHCTL         hash_ctl;
1431
1432         MyProcPid = getpid();           /* reset MyProcPid */
1433
1434         /*
1435          * Reset signal handling.  With the exception of restoring default
1436          * SIGCHLD and SIGQUIT handling, this is a no-op in the
1437          * non-EXEC_BACKEND case because we'll have inherited these settings
1438          * from the buffer process; but it's not a no-op for EXEC_BACKEND.
1439          */
1440         pqsignal(SIGHUP, SIG_IGN);
1441         pqsignal(SIGINT, SIG_IGN);
1442         pqsignal(SIGTERM, SIG_IGN);
1443 #ifndef WIN32
1444         pqsignal(SIGQUIT, SIG_IGN);
1445 #else
1446         /* kluge to allow buffer process to kill collector; FIXME */
1447         pqsignal(SIGQUIT, pgstat_exit);
1448 #endif
1449         pqsignal(SIGALRM, SIG_IGN);
1450         pqsignal(SIGPIPE, SIG_IGN);
1451         pqsignal(SIGUSR1, SIG_IGN);
1452         pqsignal(SIGUSR2, SIG_IGN);
1453         pqsignal(SIGCHLD, SIG_DFL);
1454         pqsignal(SIGTTIN, SIG_DFL);
1455         pqsignal(SIGTTOU, SIG_DFL);
1456         pqsignal(SIGCONT, SIG_DFL);
1457         pqsignal(SIGWINCH, SIG_DFL);
1458         PG_SETMASK(&UnBlockSig);
1459
1460 #ifdef EXEC_BACKEND
1461         pgstat_parseArgs(argc, argv);
1462 #endif
1463
1464         /* Close unwanted files */
1465         closesocket(pgStatPipe[1]);
1466         closesocket(pgStatSock);
1467
1468         /*
1469          * Identify myself via ps
1470          */
1471         init_ps_display("stats collector process", "", "");
1472         set_ps_display("");
1473
1474         /*
1475          * Initialize filenames needed for status reports.
1476          */
1477         snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);
1478         /* tmpfname need only be set correctly in this process */
1479         snprintf(pgStat_tmpfname, MAXPGPATH, PGSTAT_STAT_TMPFILE,
1480                          DataDir, (int)getpid());
1481
1482         /*
1483          * Arrange to write the initial status file right away
1484          */
1485         gettimeofday(&next_statwrite, NULL);
1486         need_statwrite = TRUE;
1487
1488         /*
1489          * Read in an existing statistics stats file or initialize the stats
1490          * to zero.
1491          */
1492         pgStatRunningInCollector = TRUE;
1493         pgstat_read_statsfile(&pgStatDBHash, InvalidOid, NULL, NULL);
1494
1495         /*
1496          * Create the dead backend hashtable
1497          */
1498         memset(&hash_ctl, 0, sizeof(hash_ctl));
1499         hash_ctl.keysize = sizeof(int);
1500         hash_ctl.entrysize = sizeof(PgStat_StatBeDead);
1501         hash_ctl.hash = tag_hash;
1502         pgStatBeDead = hash_create("Dead Backends", PGSTAT_BE_HASH_SIZE,
1503                                                            &hash_ctl, HASH_ELEM | HASH_FUNCTION);
1504
1505         /*
1506          * Create the known backends table
1507          */
1508         pgStatBeTable = (PgStat_StatBeEntry *) palloc0(
1509                                                            sizeof(PgStat_StatBeEntry) * MaxBackends);
1510
1511         readPipe = pgStatPipe[0];
1512
1513         /*
1514          * Process incoming messages and handle all the reporting stuff until
1515          * there are no more messages.
1516          */
1517         for (;;)
1518         {
1519                 /*
1520                  * If we need to write the status file again (there have been
1521                  * changes in the statistics since we wrote it last) calculate the
1522                  * timeout until we have to do so.
1523                  */
1524                 if (need_statwrite)
1525                 {
1526                         struct timeval now;
1527
1528                         gettimeofday(&now, NULL);
1529                         /* avoid assuming that tv_sec is signed */
1530                         if (now.tv_sec > next_statwrite.tv_sec ||
1531                                 (now.tv_sec == next_statwrite.tv_sec &&
1532                                  now.tv_usec >= next_statwrite.tv_usec))
1533                         {
1534                                 timeout.tv_sec = 0;
1535                                 timeout.tv_usec = 0;
1536                         }
1537                         else
1538                         {
1539                                 timeout.tv_sec = next_statwrite.tv_sec - now.tv_sec;
1540                                 timeout.tv_usec = next_statwrite.tv_usec - now.tv_usec;
1541                                 if (timeout.tv_usec < 0)
1542                                 {
1543                                         timeout.tv_sec--;
1544                                         timeout.tv_usec += 1000000;
1545                                 }
1546                         }
1547                 }
1548
1549                 /*
1550                  * Setup the descriptor set for select(2)
1551                  */
1552                 FD_ZERO(&rfds);
1553                 FD_SET(readPipe, &rfds);
1554
1555                 /*
1556                  * Now wait for something to do.
1557                  */
1558                 nready = select(readPipe + 1, &rfds, NULL, NULL,
1559                                                 (need_statwrite) ? &timeout : NULL);
1560                 if (nready < 0)
1561                 {
1562                         if (errno == EINTR)
1563                                 continue;
1564                         ereport(ERROR,
1565                                         (errcode_for_socket_access(),
1566                                  errmsg("select() failed in statistics collector: %m")));
1567                 }
1568
1569                 /*
1570                  * If there are no descriptors ready, our timeout for writing the
1571                  * stats file happened.
1572                  */
1573                 if (nready == 0)
1574                 {
1575                         pgstat_write_statsfile();
1576                         need_statwrite = FALSE;
1577
1578                         continue;
1579                 }
1580
1581                 /*
1582                  * Check if there is a new statistics message to collect.
1583                  */
1584                 if (FD_ISSET(readPipe, &rfds))
1585                 {
1586                         /*
1587                          * We may need to issue multiple read calls in case the buffer
1588                          * process didn't write the message in a single write, which
1589                          * is possible since it dumps its buffer bytewise. In any
1590                          * case, we'd need two reads since we don't know the message
1591                          * length initially.
1592                          */
1593                         int                     nread = 0;
1594                         int                     targetlen = sizeof(PgStat_MsgHdr);              /* initial */
1595                         bool            pipeEOF = false;
1596
1597                         while (nread < targetlen)
1598                         {
1599                                 len = piperead(readPipe, ((char *) &msg) + nread,
1600                                                            targetlen - nread);
1601                                 if (len < 0)
1602                                 {
1603                                         if (errno == EINTR)
1604                                                 continue;
1605                                         ereport(ERROR,
1606                                                         (errcode_for_socket_access(),
1607                                                          errmsg("could not read from statistics collector pipe: %m")));
1608                                 }
1609                                 if (len == 0)   /* EOF on the pipe! */
1610                                 {
1611                                         pipeEOF = true;
1612                                         break;
1613                                 }
1614                                 nread += len;
1615                                 if (nread == sizeof(PgStat_MsgHdr))
1616                                 {
1617                                         /* we have the header, compute actual msg length */
1618                                         targetlen = msg.msg_hdr.m_size;
1619                                         if (targetlen < (int) sizeof(PgStat_MsgHdr) ||
1620                                                 targetlen > (int) sizeof(msg))
1621                                         {
1622                                                 /*
1623                                                  * Bogus message length implies that we got out of
1624                                                  * sync with the buffer process somehow. Abort so
1625                                                  * that we can restart both processes.
1626                                                  */
1627                                                 ereport(ERROR,
1628                                                   (errmsg("invalid statistics message length")));
1629                                         }
1630                                 }
1631                         }
1632
1633                         /*
1634                          * EOF on the pipe implies that the buffer process exited.
1635                          * Fall out of outer loop.
1636                          */
1637                         if (pipeEOF)
1638                                 break;
1639
1640                         /*
1641                          * Distribute the message to the specific function handling
1642                          * it.
1643                          */
1644                         switch (msg.msg_hdr.m_type)
1645                         {
1646                                 case PGSTAT_MTYPE_DUMMY:
1647                                         break;
1648
1649                                 case PGSTAT_MTYPE_BESTART:
1650                                         pgstat_recv_bestart((PgStat_MsgBestart *) &msg, nread);
1651                                         break;
1652
1653                                 case PGSTAT_MTYPE_BETERM:
1654                                         pgstat_recv_beterm((PgStat_MsgBeterm *) &msg, nread);
1655                                         break;
1656
1657                                 case PGSTAT_MTYPE_TABSTAT:
1658                                         pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, nread);
1659                                         break;
1660
1661                                 case PGSTAT_MTYPE_TABPURGE:
1662                                         pgstat_recv_tabpurge((PgStat_MsgTabpurge *) &msg, nread);
1663                                         break;
1664
1665                                 case PGSTAT_MTYPE_ACTIVITY:
1666                                         pgstat_recv_activity((PgStat_MsgActivity *) &msg, nread);
1667                                         break;
1668
1669                                 case PGSTAT_MTYPE_DROPDB:
1670                                         pgstat_recv_dropdb((PgStat_MsgDropdb *) &msg, nread);
1671                                         break;
1672
1673                                 case PGSTAT_MTYPE_RESETCOUNTER:
1674                                         pgstat_recv_resetcounter((PgStat_MsgResetcounter *) &msg,
1675                                                                                          nread);
1676                                         break;
1677
1678                                 default:
1679                                         break;
1680                         }
1681
1682                         /*
1683                          * Globally count messages.
1684                          */
1685                         pgStatNumMessages++;
1686
1687                         /*
1688                          * If this is the first message after we wrote the stats file
1689                          * the last time, setup the timeout that it'd be written.
1690                          */
1691                         if (!need_statwrite)
1692                         {
1693                                 gettimeofday(&next_statwrite, NULL);
1694                                 next_statwrite.tv_usec += ((PGSTAT_STAT_INTERVAL) * 1000);
1695                                 next_statwrite.tv_sec += (next_statwrite.tv_usec / 1000000);
1696                                 next_statwrite.tv_usec %= 1000000;
1697                                 need_statwrite = TRUE;
1698                         }
1699                 }
1700
1701                 /*
1702                  * Note that we do NOT check for postmaster exit inside the loop;
1703                  * only EOF on the buffer pipe causes us to fall out.  This
1704                  * ensures we don't exit prematurely if there are still a few
1705                  * messages in the buffer or pipe at postmaster shutdown.
1706                  */
1707         }
1708
1709         /*
1710          * Okay, we saw EOF on the buffer pipe, so there are no more messages
1711          * to process.  If the buffer process quit because of postmaster
1712          * shutdown, we want to save the final stats to reuse at next startup.
1713          * But if the buffer process failed, it seems best not to (there may
1714          * even now be a new collector firing up, and we don't want it to read
1715          * a partially-rewritten stats file).
1716          */
1717         if (!PostmasterIsAlive(false))
1718                 pgstat_write_statsfile();
1719 }
1720
1721
1722 /* ----------
1723  * pgstat_recvbuffer() -
1724  *
1725  *      This is the body of the separate buffering process. Its only
1726  *      purpose is to receive messages from the UDP socket as fast as
1727  *      possible and forward them over a pipe into the collector itself.
1728  *      If the collector is slow to absorb messages, they are buffered here.
1729  * ----------
1730  */
1731 static void
1732 pgstat_recvbuffer(void)
1733 {
1734         fd_set          rfds;
1735         fd_set          wfds;
1736         struct timeval timeout;
1737         int                     writePipe = pgStatPipe[1];
1738         int                     maxfd;
1739         int                     nready;
1740         int                     len;
1741         int                     xfr;
1742         int                     frm;
1743         PgStat_Msg      input_buffer;
1744         char       *msgbuffer;
1745         int                     msg_send = 0;   /* next send index in buffer */
1746         int                     msg_recv = 0;   /* next receive index */
1747         int                     msg_have = 0;   /* number of bytes stored */
1748         bool            overflow = false;
1749
1750         /*
1751          * Identify myself via ps
1752          */
1753         init_ps_display("stats buffer process", "", "");
1754         set_ps_display("");
1755
1756         /*
1757          * We want to die if our child collector process does.  There are two
1758          * ways we might notice that it has died: receive SIGCHLD, or get a
1759          * write failure on the pipe leading to the child.      We can set SIGPIPE
1760          * to kill us here.  Our SIGCHLD handler was already set up before we
1761          * forked (must do it that way, else it's a race condition).
1762          */
1763         pqsignal(SIGPIPE, SIG_DFL);
1764         PG_SETMASK(&UnBlockSig);
1765
1766         /*
1767          * Set the write pipe to nonblock mode, so that we cannot block when
1768          * the collector falls behind.
1769          */
1770         if (!pg_set_noblock(writePipe))
1771                 ereport(ERROR,
1772                                 (errcode_for_socket_access(),
1773                                  errmsg("could not set statistics collector pipe to nonblocking mode: %m")));
1774
1775         /*
1776          * Allocate the message buffer
1777          */
1778         msgbuffer = (char *) palloc(PGSTAT_RECVBUFFERSZ);
1779
1780         /*
1781          * Loop forever
1782          */
1783         for (;;)
1784         {
1785                 FD_ZERO(&rfds);
1786                 FD_ZERO(&wfds);
1787                 maxfd = -1;
1788
1789                 /*
1790                  * As long as we have buffer space we add the socket to the read
1791                  * descriptor set.
1792                  */
1793                 if (msg_have <= (int) (PGSTAT_RECVBUFFERSZ - sizeof(PgStat_Msg)))
1794                 {
1795                         FD_SET(pgStatSock, &rfds);
1796                         maxfd = pgStatSock;
1797                         overflow = false;
1798                 }
1799                 else
1800                 {
1801                         if (!overflow)
1802                         {
1803                                 ereport(LOG,
1804                                                 (errmsg("statistics buffer is full")));
1805                                 overflow = true;
1806                         }
1807                 }
1808
1809                 /*
1810                  * If we have messages to write out, we add the pipe to the write
1811                  * descriptor set.
1812                  */
1813                 if (msg_have > 0)
1814                 {
1815                         FD_SET(writePipe, &wfds);
1816                         if (writePipe > maxfd)
1817                                 maxfd = writePipe;
1818                 }
1819
1820                 /*
1821                  * Wait for some work to do; but not for more than 10 seconds.
1822                  * (This determines how quickly we will shut down after an
1823                  * ungraceful postmaster termination; so it needn't be very fast.)
1824                  */
1825                 timeout.tv_sec = 10;
1826                 timeout.tv_usec = 0;
1827
1828                 nready = select(maxfd + 1, &rfds, &wfds, NULL, &timeout);
1829                 if (nready < 0)
1830                 {
1831                         if (errno == EINTR)
1832                                 continue;
1833                         ereport(ERROR,
1834                                         (errcode_for_socket_access(),
1835                                          errmsg("select() failed in statistics buffer: %m")));
1836                 }
1837
1838                 /*
1839                  * If there is a message on the socket, read it and check for
1840                  * validity.
1841                  */
1842                 if (FD_ISSET(pgStatSock, &rfds))
1843                 {
1844                         len = recv(pgStatSock, (char *) &input_buffer,
1845                                            sizeof(PgStat_Msg), 0);
1846                         if (len < 0)
1847                                 ereport(ERROR,
1848                                                 (errcode_for_socket_access(),
1849                                            errmsg("could not read statistics message: %m")));
1850
1851                         /*
1852                          * We ignore messages that are smaller than our common header
1853                          */
1854                         if (len < sizeof(PgStat_MsgHdr))
1855                                 continue;
1856
1857                         /*
1858                          * The received length must match the length in the header
1859                          */
1860                         if (input_buffer.msg_hdr.m_size != len)
1861                                 continue;
1862
1863                         /*
1864                          * O.K. - we accept this message.  Copy it to the circular
1865                          * msgbuffer.
1866                          */
1867                         frm = 0;
1868                         while (len > 0)
1869                         {
1870                                 xfr = PGSTAT_RECVBUFFERSZ - msg_recv;
1871                                 if (xfr > len)
1872                                         xfr = len;
1873                                 Assert(xfr > 0);
1874                                 memcpy(msgbuffer + msg_recv,
1875                                            ((char *) &input_buffer) + frm,
1876                                            xfr);
1877                                 msg_recv += xfr;
1878                                 if (msg_recv == PGSTAT_RECVBUFFERSZ)
1879                                         msg_recv = 0;
1880                                 msg_have += xfr;
1881                                 frm += xfr;
1882                                 len -= xfr;
1883                         }
1884                 }
1885
1886                 /*
1887                  * If the collector is ready to receive, write some data into his
1888                  * pipe.  We may or may not be able to write all that we have.
1889                  *
1890                  * NOTE: if what we have is less than PIPE_BUF bytes but more than
1891                  * the space available in the pipe buffer, most kernels will
1892                  * refuse to write any of it, and will return EAGAIN.  This means
1893                  * we will busy-loop until the situation changes (either because
1894                  * the collector caught up, or because more data arrives so that
1895                  * we have more than PIPE_BUF bytes buffered).  This is not good,
1896                  * but is there any way around it?      We have no way to tell when
1897                  * the collector has caught up...
1898                  */
1899                 if (FD_ISSET(writePipe, &wfds))
1900                 {
1901                         xfr = PGSTAT_RECVBUFFERSZ - msg_send;
1902                         if (xfr > msg_have)
1903                                 xfr = msg_have;
1904                         Assert(xfr > 0);
1905                         len = pipewrite(writePipe, msgbuffer + msg_send, xfr);
1906                         if (len < 0)
1907                         {
1908                                 if (errno == EINTR || errno == EAGAIN)
1909                                         continue;       /* not enough space in pipe */
1910                                 ereport(ERROR,
1911                                                 (errcode_for_socket_access(),
1912                                                  errmsg("could not write to statistics collector pipe: %m")));
1913                         }
1914                         /* NB: len < xfr is okay */
1915                         msg_send += len;
1916                         if (msg_send == PGSTAT_RECVBUFFERSZ)
1917                                 msg_send = 0;
1918                         msg_have -= len;
1919                 }
1920
1921                 /*
1922                  * Make sure we forwarded all messages before we check for
1923                  * postmaster termination.
1924                  */
1925                 if (msg_have != 0 || FD_ISSET(pgStatSock, &rfds))
1926                         continue;
1927
1928                 /*
1929                  * If the postmaster has terminated, we die too.  (This is no
1930                  * longer the normal exit path, however.)
1931                  */
1932                 if (!PostmasterIsAlive(true))
1933                         exit(0);
1934         }
1935 }
1936
/*
 * pgstat_exit() -
 *
 *	SIGQUIT signal handler for buffer process.
 *
 *	For now we simply leave at once with exit status 0.  Forwarding any
 *	messages that are still pending might be cleaner, but that would have
 *	to be traded off against the speed of exit.
 */
static void
pgstat_exit(SIGNAL_ARGS)
{
#ifdef WIN32

	/*
	 * If running in bufferer, take our collector down as well. On some
	 * broken win32 systems, it does not shut down automatically because
	 * of issues with socket inheritance.  XXX so why not fix the socket
	 * inheritance...
	 */
	if (pgStatCollectorPid > 0)
	{
		kill(pgStatCollectorPid, SIGQUIT);
	}
#endif

	exit(0);
}
1958
/*
 * pgstat_die() -
 *
 *	SIGCHLD signal handler for buffer process: terminate the process
 *	immediately with exit status 1.
 */
static void
pgstat_die(SIGNAL_ARGS)
{
	exit(1);
}
1965
1966
1967 /* ----------
1968  * pgstat_add_backend() -
1969  *
1970  *      Support function to keep our backend list up to date.
1971  * ----------
1972  */
1973 static int
1974 pgstat_add_backend(PgStat_MsgHdr *msg)
1975 {
1976         PgStat_StatBeEntry *beentry;
1977         PgStat_StatBeDead *deadbe;
1978
1979         /*
1980          * Check that the backend ID is valid
1981          */
1982         if (msg->m_backendid < 1 || msg->m_backendid > MaxBackends)
1983         {
1984                 ereport(LOG,
1985                          (errmsg("invalid server process ID %d", msg->m_backendid)));
1986                 return -1;
1987         }
1988
1989         /*
1990          * Get the slot for this backendid.
1991          */
1992         beentry = &pgStatBeTable[msg->m_backendid - 1];
1993
1994         /*
1995          * If the slot contains the PID of this backend, everything is
1996          * fine and we have nothing to do. Note that all the slots are
1997          * zero'd out when the collector is started. We assume that a slot
1998          * is "empty" iff procpid == 0.
1999          */
2000         if (beentry->procpid > 0 && beentry->procpid == msg->m_procpid)
2001                 return 0;
2002
2003         /*
2004          * Lookup if this backend is known to be dead. This can be caused due
2005          * to messages arriving in the wrong order - e.g. postmaster's BETERM
2006          * message might have arrived before we received all the backends
2007          * stats messages, or even a new backend with the same backendid was
2008          * faster in sending his BESTART.
2009          *
2010          * If the backend is known to be dead, we ignore this add.
2011          */
2012         deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
2013                                                                                            (void *) &(msg->m_procpid),
2014                                                                                            HASH_FIND, NULL);
2015         if (deadbe)
2016                 return 1;
2017
2018         /*
2019          * Backend isn't known to be dead. If it's slot is currently used, we
2020          * have to kick out the old backend.
2021          */
2022         if (beentry->procpid > 0)
2023                 pgstat_sub_backend(beentry->procpid);
2024
2025         /* Must be able to distinguish between empty and non-empty slots */
2026         Assert(msg->m_procpid > 0);
2027
2028         /* Put this new backend into the slot */
2029         beentry->procpid = msg->m_procpid;
2030         beentry->start_sec = 
2031                 GetCurrentAbsoluteTimeUsec(&beentry->start_usec);
2032         beentry->activity_start_sec = 0;
2033         beentry->activity_start_usec = 0;
2034         beentry->activity[0] = '\0';
2035
2036         /*
2037          * We can't initialize the rest of the data in this slot until we
2038          * see the BESTART message. Therefore, we set the database and
2039          * user to sentinel values, to indicate "undefined". There is no
2040          * easy way to do this for the client address, so make sure to
2041          * check that the database or user are defined before accessing
2042          * the client address.
2043          */
2044         beentry->userid = InvalidOid;
2045         beentry->databaseid = InvalidOid;
2046
2047         return 0;
2048 }
2049
2050 /*
2051  * Lookup the hash table entry for the specified database. If no hash
2052  * table entry exists, initialize it.
2053  */
2054 static PgStat_StatDBEntry *
2055 pgstat_get_db_entry(Oid databaseid)
2056 {
2057         PgStat_StatDBEntry *result;
2058         bool found;
2059
2060         /* Lookup or create the hash table entry for this database */
2061         result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2062                                                                                                 &databaseid,
2063                                                                                                 HASH_ENTER, &found);
2064
2065         /* If not found, initialize the new one. */
2066         if (!found)
2067         {
2068                 HASHCTL         hash_ctl;
2069
2070                 result->tables = NULL;
2071                 result->n_xact_commit = 0;
2072                 result->n_xact_rollback = 0;
2073                 result->n_blocks_fetched = 0;
2074                 result->n_blocks_hit = 0;
2075                 result->destroy = 0;
2076
2077                 memset(&hash_ctl, 0, sizeof(hash_ctl));
2078                 hash_ctl.keysize = sizeof(Oid);
2079                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
2080                 hash_ctl.hash = oid_hash;
2081                 result->tables = hash_create("Per-database table",
2082                                                                           PGSTAT_TAB_HASH_SIZE,
2083                                                                           &hash_ctl,
2084                                                                           HASH_ELEM | HASH_FUNCTION);
2085         }
2086
2087         return result;
2088 }
2089
2090 /* ----------
2091  * pgstat_sub_backend() -
2092  *
2093  *      Remove a backend from the actual backends list.
2094  * ----------
2095  */
2096 static void
2097 pgstat_sub_backend(int procpid)
2098 {
2099         int                     i;
2100         PgStat_StatBeDead *deadbe;
2101         bool            found;
2102
2103         /*
2104          * Search in the known-backends table for the slot containing this
2105          * PID.
2106          */
2107         for (i = 0; i < MaxBackends; i++)
2108         {
2109                 if (pgStatBeTable[i].procpid == procpid)
2110                 {
2111                         /*
2112                          * That's him. Add an entry to the known to be dead backends.
2113                          * Due to possible misorder in the arrival of UDP packets it's
2114                          * possible that even if we know the backend is dead, there
2115                          * could still be messages queued that arrive later. Those
2116                          * messages must not cause our number of backends statistics
2117                          * to get screwed up, so we remember for a couple of seconds
2118                          * that this PID is dead and ignore them (only the counting of
2119                          * backends, not the table access stats they sent).
2120                          */
2121                         deadbe = (PgStat_StatBeDead *) hash_search(pgStatBeDead,
2122                                                                                                            (void *) &procpid,
2123                                                                                                            HASH_ENTER,
2124                                                                                                            &found);
2125
2126                         if (!found)
2127                         {
2128                                 deadbe->backendid = i + 1;
2129                                 deadbe->destroy = PGSTAT_DESTROY_COUNT;
2130                         }
2131
2132                         /*
2133                          * Declare the backend slot empty.
2134                          */
2135                         pgStatBeTable[i].procpid = 0;
2136                         return;
2137                 }
2138         }
2139
2140         /*
2141          * No big problem if not found. This can happen if UDP messages arrive
2142          * out of order here.
2143          */
2144 }
2145
2146
2147 /* ----------
2148  * pgstat_write_statsfile() -
2149  *
2150  *      Tell the news.
2151  * ----------
2152  */
2153 static void
2154 pgstat_write_statsfile(void)
2155 {
2156         HASH_SEQ_STATUS hstat;
2157         HASH_SEQ_STATUS tstat;
2158         PgStat_StatDBEntry *dbentry;
2159         PgStat_StatTabEntry *tabentry;
2160         PgStat_StatBeDead *deadbe;
2161         FILE       *fpout;
2162         int                     i;
2163
2164         /*
2165          * Open the statistics temp file to write out the current values.
2166          */
2167         fpout = fopen(pgStat_tmpfname, PG_BINARY_W);
2168         if (fpout == NULL)
2169         {
2170                 ereport(LOG,
2171                                 (errcode_for_file_access(),
2172                         errmsg("could not open temporary statistics file \"%s\": %m",
2173                                    pgStat_tmpfname)));
2174                 return;
2175         }
2176
2177         /*
2178          * Walk through the database table.
2179          */
2180         hash_seq_init(&hstat, pgStatDBHash);
2181         while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
2182         {
2183                 /*
2184                  * If this database is marked destroyed, count down and do so if
2185                  * it reaches 0.
2186                  */
2187                 if (dbentry->destroy > 0)
2188                 {
2189                         if (--(dbentry->destroy) == 0)
2190                         {
2191                                 if (dbentry->tables != NULL)
2192                                         hash_destroy(dbentry->tables);
2193
2194                                 if (hash_search(pgStatDBHash,
2195                                                                 (void *) &(dbentry->databaseid),
2196                                                                 HASH_REMOVE, NULL) == NULL)
2197                                         ereport(ERROR,
2198                                                         (errmsg("database hash table corrupted "
2199                                                                         "during cleanup --- abort")));
2200                         }
2201
2202                         /*
2203                          * Don't include statistics for it.
2204                          */
2205                         continue;
2206                 }
2207
2208                 /*
2209                  * Write out the DB line including the number of life backends.
2210                  */
2211                 fputc('D', fpout);
2212                 fwrite(dbentry, sizeof(PgStat_StatDBEntry), 1, fpout);
2213
2214                 /*
2215                  * Walk through the databases access stats per table.
2216                  */
2217                 hash_seq_init(&tstat, dbentry->tables);
2218                 while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
2219                 {
2220                         /*
2221                          * If table entry marked for destruction, same as above for
2222                          * the database entry.
2223                          */
2224                         if (tabentry->destroy > 0)
2225                         {
2226                                 if (--(tabentry->destroy) == 0)
2227                                 {
2228                                         if (hash_search(dbentry->tables,
2229                                                                         (void *) &(tabentry->tableid),
2230                                                                         HASH_REMOVE, NULL) == NULL)
2231                                         {
2232                                                 ereport(ERROR,
2233                                                                 (errmsg("tables hash table for "
2234                                                                                 "database %u corrupted during "
2235                                                                                 "cleanup --- abort",
2236                                                                                 dbentry->databaseid)));
2237                                         }
2238                                 }
2239                                 continue;
2240                         }
2241
2242                         /*
2243                          * At least we think this is still a life table. Print it's
2244                          * access stats.
2245                          */
2246                         fputc('T', fpout);
2247                         fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
2248                 }
2249
2250                 /*
2251                  * Mark the end of this DB
2252                  */
2253                 fputc('d', fpout);
2254         }
2255
2256         /*
2257          * Write out the known running backends to the stats file.
2258          */
2259         i = MaxBackends;
2260         fputc('M', fpout);
2261         fwrite(&i, sizeof(i), 1, fpout);
2262
2263         for (i = 0; i < MaxBackends; i++)
2264         {
2265                 if (pgStatBeTable[i].procpid > 0)
2266                 {
2267                         fputc('B', fpout);
2268                         fwrite(&pgStatBeTable[i], sizeof(PgStat_StatBeEntry), 1, fpout);
2269                 }
2270         }
2271
2272         /*
2273          * No more output to be done. Close the temp file and replace the old
2274          * pgstat.stat with it.
2275          */
2276         fputc('E', fpout);
2277         if (fclose(fpout) < 0)
2278         {
2279                 ereport(LOG,
2280                                 (errcode_for_file_access(),
2281                    errmsg("could not close temporary statistics file \"%s\": %m",
2282                                   pgStat_tmpfname)));
2283         }
2284         else
2285         {
2286                 if (rename(pgStat_tmpfname, pgStat_fname) < 0)
2287                 {
2288                         ereport(LOG,
2289                                         (errcode_for_file_access(),
2290                                          errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
2291                                                         pgStat_tmpfname, pgStat_fname)));
2292                 }
2293         }
2294
2295         /*
2296          * Clear out the dead backends table
2297          */
2298         hash_seq_init(&hstat, pgStatBeDead);
2299         while ((deadbe = (PgStat_StatBeDead *) hash_seq_search(&hstat)) != NULL)
2300         {
2301                 /*
2302                  * Count down the destroy delay and remove entries where it
2303                  * reaches 0.
2304                  */
2305                 if (--(deadbe->destroy) <= 0)
2306                 {
2307                         if (hash_search(pgStatBeDead,
2308                                                         (void *) &(deadbe->procpid),
2309                                                         HASH_REMOVE, NULL) == NULL)
2310                         {
2311                                 ereport(ERROR,
2312                                           (errmsg("dead-server-process hash table corrupted "
2313                                                           "during cleanup --- abort")));
2314                         }
2315                 }
2316         }
2317 }
2318
2319
2320 /* ----------
2321  * pgstat_read_statsfile() -
2322  *
2323  *      Reads in an existing statistics collector file and initializes the
2324  *      databases hash table (whose entries point to the tables hash tables)
2325  *      and the current backend table.
2326  * ----------
2327  */
2328 static void
2329 pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
2330                                           PgStat_StatBeEntry **betab, int *numbackends)
2331 {
2332         PgStat_StatDBEntry *dbentry;
2333         PgStat_StatDBEntry dbbuf;
2334         PgStat_StatTabEntry *tabentry;
2335         PgStat_StatTabEntry tabbuf;
2336         HASHCTL         hash_ctl;
2337         HTAB       *tabhash = NULL;
2338         FILE       *fpin;
2339         int                     maxbackends = 0;
2340         int                     havebackends = 0;
2341         bool            found;
2342         MemoryContext use_mcxt;
2343         int                     mcxt_flags;
2344
2345         /*
2346          * If running in the collector we use the DynaHashCxt memory context.
2347          * If running in a backend, we use the TopTransactionContext instead,
2348          * so the caller must only know the last XactId when this call
2349          * happened to know if his tables are still valid or already gone!
2350          */
2351         if (pgStatRunningInCollector)
2352         {
2353                 use_mcxt = NULL;
2354                 mcxt_flags = 0;
2355         }
2356         else
2357         {
2358                 use_mcxt = TopTransactionContext;
2359                 mcxt_flags = HASH_CONTEXT;
2360         }
2361
2362         /*
2363          * Create the DB hashtable
2364          */
2365         memset(&hash_ctl, 0, sizeof(hash_ctl));
2366         hash_ctl.keysize = sizeof(Oid);
2367         hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
2368         hash_ctl.hash = oid_hash;
2369         hash_ctl.hcxt = use_mcxt;
2370         *dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
2371                                                   HASH_ELEM | HASH_FUNCTION | mcxt_flags);
2372
2373         /*
2374          * Initialize the number of known backends to zero, just in case we do
2375          * a silent error return below.
2376          */
2377         if (numbackends != NULL)
2378                 *numbackends = 0;
2379         if (betab != NULL)
2380                 *betab = NULL;
2381
2382         /*
2383          * In EXEC_BACKEND case, we won't have inherited pgStat_fname from
2384          * postmaster, so compute it first time through.
2385          */
2386 #ifdef EXEC_BACKEND
2387         if (pgStat_fname[0] == '\0')
2388         {
2389                 Assert(DataDir != NULL);
2390                 snprintf(pgStat_fname, MAXPGPATH, PGSTAT_STAT_FILENAME, DataDir);
2391         }
2392 #endif
2393
2394         /*
2395          * Try to open the status file. If it doesn't exist, the backends
2396          * simply return zero for anything and the collector simply starts
2397          * from scratch with empty counters.
2398          */
2399         if ((fpin = AllocateFile(pgStat_fname, PG_BINARY_R)) == NULL)
2400                 return;
2401
2402         /*
2403          * We found an existing collector stats file. Read it and put all the
2404          * hashtable entries into place.
2405          */
2406         for (;;)
2407         {
2408                 switch (fgetc(fpin))
2409                 {
2410                                 /*
2411                                  * 'D'  A PgStat_StatDBEntry struct describing a database
2412                                  * follows. Subsequently, zero to many 'T' entries will
2413                                  * follow until a 'd' is encountered.
2414                                  */
2415                         case 'D':
2416                                 if (fread(&dbbuf, 1, sizeof(dbbuf), fpin) != sizeof(dbbuf))
2417                                 {
2418                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2419                                                         (errmsg("corrupted pgstat.stat file")));
2420                                         goto done;
2421                                 }
2422
2423                                 /*
2424                                  * Add to the DB hash
2425                                  */
2426                                 dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
2427                                                                                           (void *) &dbbuf.databaseid,
2428                                                                                                                          HASH_ENTER,
2429                                                                                                                          &found);
2430                                 if (found)
2431                                 {
2432                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2433                                                         (errmsg("corrupted pgstat.stat file")));
2434                                         goto done;
2435                                 }
2436
2437                                 memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
2438                                 dbentry->tables = NULL;
2439                                 dbentry->destroy = 0;
2440                                 dbentry->n_backends = 0;
2441
2442                                 /*
2443                                  * Don't collect tables if not the requested DB
2444                                  */
2445                                 if (onlydb != InvalidOid && onlydb != dbbuf.databaseid)
2446                                         break;
2447
2448                                 memset(&hash_ctl, 0, sizeof(hash_ctl));
2449                                 hash_ctl.keysize = sizeof(Oid);
2450                                 hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
2451                                 hash_ctl.hash = oid_hash;
2452                                 hash_ctl.hcxt = use_mcxt;
2453                                 dbentry->tables = hash_create("Per-database table",
2454                                                                                           PGSTAT_TAB_HASH_SIZE,
2455                                                                                           &hash_ctl,
2456                                                                                           HASH_ELEM | HASH_FUNCTION | mcxt_flags);
2457
2458                                 /*
2459                                  * Arrange that following 'T's add entries to this
2460                                  * databases tables hash table.
2461                                  */
2462                                 tabhash = dbentry->tables;
2463                                 break;
2464
2465                                 /*
2466                                  * 'd'  End of this database.
2467                                  */
2468                         case 'd':
2469                                 tabhash = NULL;
2470                                 break;
2471
2472                                 /*
2473                                  * 'T'  A PgStat_StatTabEntry follows.
2474                                  */
2475                         case 'T':
2476                                 if (fread(&tabbuf, 1, sizeof(tabbuf), fpin) != sizeof(tabbuf))
2477                                 {
2478                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2479                                                         (errmsg("corrupted pgstat.stat file")));
2480                                         goto done;
2481                                 }
2482
2483                                 /*
2484                                  * Skip if table belongs to a not requested database.
2485                                  */
2486                                 if (tabhash == NULL)
2487                                         break;
2488
2489                                 tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
2490                                                                                                 (void *) &tabbuf.tableid,
2491                                                                                                          HASH_ENTER, &found);
2492
2493                                 if (found)
2494                                 {
2495                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2496                                                         (errmsg("corrupted pgstat.stat file")));
2497                                         goto done;
2498                                 }
2499
2500                                 memcpy(tabentry, &tabbuf, sizeof(tabbuf));
2501                                 break;
2502
2503                                 /*
2504                                  * 'M'  The maximum number of backends to expect follows.
2505                                  */
2506                         case 'M':
2507                                 if (betab == NULL || numbackends == NULL)
2508                                         goto done;
2509                                 if (fread(&maxbackends, 1, sizeof(maxbackends), fpin) !=
2510                                         sizeof(maxbackends))
2511                                 {
2512                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2513                                                         (errmsg("corrupted pgstat.stat file")));
2514                                         goto done;
2515                                 }
2516                                 if (maxbackends == 0)
2517                                         goto done;
2518
2519                                 /*
2520                                  * Allocate space (in TopTransactionContext too) for the
2521                                  * backend table.
2522                                  */
2523                                 if (use_mcxt == NULL)
2524                                         *betab = (PgStat_StatBeEntry *) palloc(
2525                                                            sizeof(PgStat_StatBeEntry) * maxbackends);
2526                                 else
2527                                         *betab = (PgStat_StatBeEntry *) MemoryContextAlloc(
2528                                                                                                                                 use_mcxt,
2529                                                            sizeof(PgStat_StatBeEntry) * maxbackends);
2530                                 break;
2531
2532                                 /*
2533                                  * 'B'  A PgStat_StatBeEntry follows.
2534                                  */
2535                         case 'B':
2536                                 if (betab == NULL || numbackends == NULL || *betab == NULL)
2537                                         goto done;
2538
2539                                 /*
2540                                  * Read it directly into the table.
2541                                  */
2542                                 if (fread(&(*betab)[havebackends], 1,
2543                                                   sizeof(PgStat_StatBeEntry), fpin) !=
2544                                         sizeof(PgStat_StatBeEntry))
2545                                 {
2546                                         ereport(pgStatRunningInCollector ? LOG : WARNING,
2547                                                         (errmsg("corrupted pgstat.stat file")));
2548                                         goto done;
2549                                 }
2550
2551                                 /*
2552                                  * Count backends per database here.
2553                                  */
2554                                 dbentry = (PgStat_StatDBEntry *) hash_search(*dbhash,
2555                                                    (void *) &((*betab)[havebackends].databaseid),
2556                                                                                                                 HASH_FIND, NULL);
2557                                 if (dbentry)
2558                                         dbentry->n_backends++;
2559
2560                                 havebackends++;
2561                                 if (numbackends != 0)
2562                                         *numbackends = havebackends;
2563                                 if (havebackends >= maxbackends)
2564                                         goto done;
2565
2566                                 break;
2567
2568                                 /*
2569                                  * 'E'  The EOF marker of a complete stats file.
2570                                  */
2571                         case 'E':
2572                                 goto done;
2573
2574                         default:
2575                                 ereport(pgStatRunningInCollector ? LOG : WARNING,
2576                                                 (errmsg("corrupted pgstat.stat file")));
2577                                 goto done;
2578                 }
2579         }
2580
2581 done:
2582         FreeFile(fpin);
2583 }
2584
2585 /*
2586  * If not done for this transaction, read the statistics collector
2587  * stats file into some hash tables.
2588  *
2589  * Because we store the hash tables in TopTransactionContext, the result
2590  * is good for the entire current main transaction.
2591  */
2592 static void
2593 backend_read_statsfile(void)
2594 {
2595         TransactionId topXid = GetTopTransactionId();
2596
2597         if (!TransactionIdEquals(pgStatDBHashXact, topXid))
2598         {
2599                 Assert(!pgStatRunningInCollector);
2600                 pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
2601                                                           &pgStatBeTable, &pgStatNumBackends);
2602                 pgStatDBHashXact = topXid;
2603         }
2604 }
2605
2606
2607 /* ----------
2608  * pgstat_recv_bestart() -
2609  *
2610  *      Process a backend startup message.
2611  * ----------
2612  */
2613 static void
2614 pgstat_recv_bestart(PgStat_MsgBestart *msg, int len)
2615 {
2616         PgStat_StatBeEntry *entry;
2617
2618         /*
2619          * If the backend is known dead, we ignore the message -- we don't
2620          * want to update the backend entry's state since this BESTART
2621          * message refers to an old, dead backend
2622          */
2623         if (pgstat_add_backend(&msg->m_hdr) != 0)
2624                 return;
2625
2626         entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
2627         entry->userid = msg->m_userid;
2628         memcpy(&entry->clientaddr, &msg->m_clientaddr, sizeof(entry->clientaddr));
2629         entry->databaseid = msg->m_databaseid;
2630 }
2631
2632
2633 /* ----------
2634  * pgstat_recv_beterm() -
2635  *
2636  *      Process a backend termination message.
2637  * ----------
2638  */
2639 static void
2640 pgstat_recv_beterm(PgStat_MsgBeterm *msg, int len)
2641 {
2642         pgstat_sub_backend(msg->m_hdr.m_procpid);
2643 }
2644
2645
2646 /* ----------
2647  * pgstat_recv_activity() -
2648  *
2649  *      Remember what the backend is doing.
2650  * ----------
2651  */
2652 static void
2653 pgstat_recv_activity(PgStat_MsgActivity *msg, int len)
2654 {
2655         PgStat_StatBeEntry *entry;
2656
2657         /*
2658          * Here we check explicitly for 0 return, since we don't want to
2659          * mangle the activity of an active backend by a delayed packet from a
2660          * dead one.
2661          */
2662         if (pgstat_add_backend(&msg->m_hdr) != 0)
2663                 return;
2664
2665         entry = &(pgStatBeTable[msg->m_hdr.m_backendid - 1]);
2666
2667         StrNCpy(entry->activity, msg->m_what, PGSTAT_ACTIVITY_SIZE);
2668
2669         entry->activity_start_sec =
2670                 GetCurrentAbsoluteTimeUsec(&entry->activity_start_usec);
2671 }
2672
2673
2674 /* ----------
2675  * pgstat_recv_tabstat() -
2676  *
2677  *      Count what the backend has done.
2678  * ----------
2679  */
2680 static void
2681 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
2682 {
2683         PgStat_TableEntry *tabmsg = &(msg->m_entry[0]);
2684         PgStat_StatDBEntry *dbentry;
2685         PgStat_StatTabEntry *tabentry;
2686         int                     i;
2687         bool            found;
2688
2689         /*
2690          * Make sure the backend is counted for.
2691          */
2692         if (pgstat_add_backend(&msg->m_hdr) < 0)
2693                 return;
2694
2695         dbentry = pgstat_get_db_entry(msg->m_databaseid);
2696
2697         /*
2698          * If the database is marked for destroy, this is a delayed UDP packet
2699          * and not worth being counted.
2700          */
2701         if (dbentry->destroy > 0)
2702                 return;
2703
2704         dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
2705         dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
2706
2707         /*
2708          * Process all table entries in the message.
2709          */
2710         for (i = 0; i < msg->m_nentries; i++)
2711         {
2712                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2713                                                                                           (void *) &(tabmsg[i].t_id),
2714                                                                                                          HASH_ENTER, &found);
2715
2716                 if (!found)
2717                 {
2718                         /*
2719                          * If it's a new table entry, initialize counters to the
2720                          * values we just got.
2721                          */
2722                         tabentry->numscans = tabmsg[i].t_numscans;
2723                         tabentry->tuples_returned = tabmsg[i].t_tuples_returned;
2724                         tabentry->tuples_fetched = tabmsg[i].t_tuples_fetched;
2725                         tabentry->tuples_inserted = tabmsg[i].t_tuples_inserted;
2726                         tabentry->tuples_updated = tabmsg[i].t_tuples_updated;
2727                         tabentry->tuples_deleted = tabmsg[i].t_tuples_deleted;
2728                         tabentry->blocks_fetched = tabmsg[i].t_blocks_fetched;
2729                         tabentry->blocks_hit = tabmsg[i].t_blocks_hit;
2730
2731                         tabentry->destroy = 0;
2732                 }
2733                 else
2734                 {
2735                         /*
2736                          * Otherwise add the values to the existing entry.
2737                          */
2738                         tabentry->numscans += tabmsg[i].t_numscans;
2739                         tabentry->tuples_returned += tabmsg[i].t_tuples_returned;
2740                         tabentry->tuples_fetched += tabmsg[i].t_tuples_fetched;
2741                         tabentry->tuples_inserted += tabmsg[i].t_tuples_inserted;
2742                         tabentry->tuples_updated += tabmsg[i].t_tuples_updated;
2743                         tabentry->tuples_deleted += tabmsg[i].t_tuples_deleted;
2744                         tabentry->blocks_fetched += tabmsg[i].t_blocks_fetched;
2745                         tabentry->blocks_hit += tabmsg[i].t_blocks_hit;
2746                 }
2747
2748                 /*
2749                  * And add the block IO to the database entry.
2750                  */
2751                 dbentry->n_blocks_fetched += tabmsg[i].t_blocks_fetched;
2752                 dbentry->n_blocks_hit += tabmsg[i].t_blocks_hit;
2753         }
2754 }
2755
2756
2757 /* ----------
2758  * pgstat_recv_tabpurge() -
2759  *
2760  *      Arrange for dead table removal.
2761  * ----------
2762  */
2763 static void
2764 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
2765 {
2766         PgStat_StatDBEntry *dbentry;
2767         PgStat_StatTabEntry *tabentry;
2768         int                     i;
2769
2770         /*
2771          * Make sure the backend is counted for.
2772          */
2773         if (pgstat_add_backend(&msg->m_hdr) < 0)
2774                 return;
2775
2776         dbentry = pgstat_get_db_entry(msg->m_databaseid);
2777
2778         /*
2779          * If the database is marked for destroy, this is a delayed UDP packet
2780          * and the tables will go away at DB destruction.
2781          */
2782         if (dbentry->destroy > 0)
2783                 return;
2784
2785         /*
2786          * Process all table entries in the message.
2787          */
2788         for (i = 0; i < msg->m_nentries; i++)
2789         {
2790                 tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2791                                                                                    (void *) &(msg->m_tableid[i]),
2792                                                                                                            HASH_FIND, NULL);
2793                 if (tabentry)
2794                         tabentry->destroy = PGSTAT_DESTROY_COUNT;
2795         }
2796 }
2797
2798
2799 /* ----------
2800  * pgstat_recv_dropdb() -
2801  *
2802  *      Arrange for dead database removal
2803  * ----------
2804  */
2805 static void
2806 pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
2807 {
2808         PgStat_StatDBEntry *dbentry;
2809
2810         /*
2811          * Make sure the backend is counted for.
2812          */
2813         if (pgstat_add_backend(&msg->m_hdr) < 0)
2814                 return;
2815
2816         /*
2817          * Lookup the database in the hashtable.
2818          */
2819         dbentry = pgstat_get_db_entry(msg->m_databaseid);
2820
2821         /*
2822          * Mark the database for destruction.
2823          */
2824         dbentry->destroy = PGSTAT_DESTROY_COUNT;
2825 }
2826
2827
2828 /* ----------
2829  * pgstat_recv_resetcounter() -
2830  *
2831  *      Reset the statistics for the specified database.
2832  * ----------
2833  */
2834 static void
2835 pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
2836 {
2837         HASHCTL         hash_ctl;
2838         PgStat_StatDBEntry *dbentry;
2839
2840         /*
2841          * Make sure the backend is counted for.
2842          */
2843         if (pgstat_add_backend(&msg->m_hdr) < 0)
2844                 return;
2845
2846         /*
2847          * Lookup the database in the hashtable.
2848          */
2849         dbentry = pgstat_get_db_entry(msg->m_databaseid);
2850
2851         /*
2852          * We simply throw away all the database's table entries by
2853          * recreating a new hash table for them.
2854          */
2855         if (dbentry->tables != NULL)
2856                 hash_destroy(dbentry->tables);
2857
2858         dbentry->tables = NULL;
2859         dbentry->n_xact_commit = 0;
2860         dbentry->n_xact_rollback = 0;
2861         dbentry->n_blocks_fetched = 0;
2862         dbentry->n_blocks_hit = 0;
2863         dbentry->destroy = 0;
2864
2865         memset(&hash_ctl, 0, sizeof(hash_ctl));
2866         hash_ctl.keysize = sizeof(Oid);
2867         hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
2868         hash_ctl.hash = oid_hash;
2869         dbentry->tables = hash_create("Per-database table",
2870                                                                   PGSTAT_TAB_HASH_SIZE,
2871                                                                   &hash_ctl,
2872                                                                   HASH_ELEM | HASH_FUNCTION);
2873 }