]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/autovacuum.c
Fix assorted race conditions in the new timeout infrastructure.
[postgresql] / src / backend / postmaster / autovacuum.c
1 /*-------------------------------------------------------------------------
2  *
3  * autovacuum.c
4  *
5  * PostgreSQL Integrated Autovacuum Daemon
6  *
7  * The autovacuum system is structured in two different kinds of processes: the
8  * autovacuum launcher and the autovacuum worker.  The launcher is an
9  * always-running process, started by the postmaster when the autovacuum GUC
10  * parameter is set.  The launcher schedules autovacuum workers to be started
11  * when appropriate.  The workers are the processes which execute the actual
12  * vacuuming; they connect to a database as determined in the launcher, and
13  * once connected they examine the catalogs to select the tables to vacuum.
14  *
15  * The autovacuum launcher cannot start the worker processes by itself,
16  * because doing so would cause robustness issues (namely, failure to shut
17  * them down on exceptional conditions, and also, since the launcher is
18  * connected to shared memory and is thus subject to corruption there, it is
19  * not as robust as the postmaster).  So it leaves that task to the postmaster.
20  *
21  * There is an autovacuum shared memory area, where the launcher stores
22  * information about the database it wants vacuumed.  When it wants a new
23  * worker to start, it sets a flag in shared memory and sends a signal to the
24  * postmaster.  Then postmaster knows nothing more than it must start a worker;
25  * so it forks a new child, which turns into a worker.  This new process
26  * connects to shared memory, and there it can inspect the information that the
27  * launcher has set up.
28  *
29  * If the fork() call fails in the postmaster, it sets a flag in the shared
30  * memory area, and sends a signal to the launcher.  The launcher, upon
31  * noticing the flag, can try starting the worker again by resending the
32  * signal.  Note that the failure can only be transient (fork failure due to
33  * high load, memory pressure, too many processes, etc); more permanent
34  * problems, like failure to connect to a database, are detected later in the
35  * worker and dealt with just by having the worker exit normally.  The launcher
36  * will launch a new worker again later, per schedule.
37  *
38  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
39  * launcher then wakes up and is able to launch another worker, if the schedule
40  * is so tight that a new worker is needed immediately.  At this time the
41  * launcher can also balance the settings for the various remaining workers'
42  * cost-based vacuum delay feature.
43  *
44  * Note that there can be more than one worker in a database concurrently.
45  * They will store the table they are currently vacuuming in shared memory, so
46  * that other workers avoid being blocked waiting for the vacuum lock for that
47  * table.  They will also reload the pgstats data just before vacuuming each
48  * table, to avoid vacuuming a table that was just finished being vacuumed by
49  * another worker and thus is no longer noted in shared memory.  However,
50  * there is a window (caused by pgstat delay) on which a worker may choose a
51  * table that was already vacuumed; this is a bug in the current design.
52  *
53  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
54  * Portions Copyright (c) 1994, Regents of the University of California
55  *
56  *
57  * IDENTIFICATION
58  *        src/backend/postmaster/autovacuum.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 #include "postgres.h"
63
64 #include <signal.h>
65 #include <sys/types.h>
66 #include <sys/time.h>
67 #include <time.h>
68 #include <unistd.h>
69
70 #include "access/heapam.h"
71 #include "access/htup_details.h"
72 #include "access/multixact.h"
73 #include "access/reloptions.h"
74 #include "access/transam.h"
75 #include "access/xact.h"
76 #include "catalog/dependency.h"
77 #include "catalog/namespace.h"
78 #include "catalog/pg_database.h"
79 #include "commands/dbcommands.h"
80 #include "commands/vacuum.h"
81 #include "lib/ilist.h"
82 #include "libpq/pqsignal.h"
83 #include "miscadmin.h"
84 #include "pgstat.h"
85 #include "postmaster/autovacuum.h"
86 #include "postmaster/fork_process.h"
87 #include "postmaster/postmaster.h"
88 #include "storage/bufmgr.h"
89 #include "storage/ipc.h"
90 #include "storage/latch.h"
91 #include "storage/pmsignal.h"
92 #include "storage/proc.h"
93 #include "storage/procsignal.h"
94 #include "storage/sinvaladt.h"
95 #include "tcop/tcopprot.h"
96 #include "utils/fmgroids.h"
97 #include "utils/lsyscache.h"
98 #include "utils/memutils.h"
99 #include "utils/ps_status.h"
100 #include "utils/rel.h"
101 #include "utils/snapmgr.h"
102 #include "utils/syscache.h"
103 #include "utils/timeout.h"
104 #include "utils/timestamp.h"
105 #include "utils/tqual.h"
106
107
108 /*
109  * GUC parameters.  These are plain (non-static) globals.
110  */
111 bool            autovacuum_start_daemon = false;
112 int                     autovacuum_max_workers;     /* sizes the shared WorkerInfo array */
113 int                     autovacuum_naptime; /* in seconds; see its use in the launcher loop */
114 int                     autovacuum_vac_thresh;      /* NOTE(review): consumers not in view */
115 double          autovacuum_vac_scale;
116 int                     autovacuum_anl_thresh;
117 double          autovacuum_anl_scale;
118 int                     autovacuum_freeze_max_age;
119
120 int                     autovacuum_vac_cost_delay;
121 int                     autovacuum_vac_cost_limit;
122
123 int                     Log_autovacuum_min_duration = -1;   /* presumably -1 means "disabled" -- confirm */
124
125 /* how long to keep pgstat data in the launcher, in milliseconds */
126 #define STATS_READ_DELAY 1000
127
128 /* the minimum allowed time between two awakenings of the launcher */
129 #define MIN_AUTOVAC_SLEEPTIME 100.0             /* milliseconds */
130
131 /* Flags to tell if we are in an autovacuum process */
132 static bool am_autovacuum_launcher = false;     /* set by AutoVacLauncherMain, or AutovacuumLauncherIAm under EXEC_BACKEND */
133 static bool am_autovacuum_worker = false;
134
135 /* Flags set by signal handlers; the launcher main loop polls them after each wakeup */
136 static volatile sig_atomic_t got_SIGHUP = false;        /* launcher rereads the config file */
137 static volatile sig_atomic_t got_SIGUSR2 = false;       /* a worker finished, or postmaster failed to start one */
138 static volatile sig_atomic_t got_SIGTERM = false;       /* normal shutdown: launcher leaves its main loop */
139
140 /* Comparison points for determining whether freeze_max_age is exceeded */
141 static TransactionId recentXid;
142 static MultiXactId recentMulti;
143
144 /* Default freeze ages to use for autovacuum (varies by database) */
145 static int      default_freeze_min_age;
146 static int      default_freeze_table_age;
147
148 /* Memory context for long-lived data (in the launcher: child of TopMemoryContext, reset on error recovery) */
149 static MemoryContext AutovacMemCxt;
150
151 /* struct to keep track of databases in launcher; these are the elements of DatabaseList */
152 typedef struct avl_dbase
153 {
154         Oid                     adl_datid;              /* hash key -- must be first */
155         TimestampTz adl_next_worker;    /* scheduling time; DatabaseList is ordered by it, most distant first */
156         int                     adl_score;      /* NOTE(review): meaning not visible here -- see rebuild_database_list */
157         dlist_node      adl_node;       /* list link (presumably into DatabaseList -- confirm) */
158 } avl_dbase;
159
160 /* struct to keep track of databases in worker */
161 typedef struct avw_dbase
162 {
163         Oid                     adw_datid;      /* database OID */
164         char       *adw_name;   /* database name; NOTE(review): ownership/lifetime not visible here */
165         TransactionId adw_frozenxid;    /* presumably the database's frozen-XID horizon -- confirm */
166         MultiXactId adw_minmulti;       /* presumably the database's minimum multixact -- confirm */
167         PgStat_StatDBEntry *adw_entry;  /* pgstat entry for the database; may presumably be NULL -- confirm */
168 } avw_dbase;
169
170 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
171 typedef struct av_relation
172 {
173         Oid                     ar_toastrelid;  /* hash key - must be first */
174         Oid                     ar_relid;       /* presumably the main table owning that toast relation -- confirm */
175         bool            ar_hasrelopts;  /* presumably tells whether ar_reloptions was filled in -- confirm */
176         AutoVacOpts ar_reloptions;      /* copy of AutoVacOpts from the main table's
177                                                                  * reloptions; held by value, so it cannot itself be NULL */
178 } av_relation;
179
180 /* struct to keep track of tables to vacuum and/or analyze, after rechecking (the type table_recheck_autovac returns) */
181 typedef struct autovac_table
182 {
183         Oid                     at_relid;
184         bool            at_dovacuum;    /* NOTE(review): these three flags look like the outputs of */
185         bool            at_doanalyze;   /* relation_needs_vacanalyze (dovacuum, doanalyze, wraparound) -- confirm */
186         int                     at_freeze_min_age;
187         int                     at_freeze_table_age;
188         int                     at_vacuum_cost_delay;
189         int                     at_vacuum_cost_limit;
190         bool            at_wraparound;
191         char       *at_relname; /* relation, schema and database names; */
192         char       *at_nspname; /* NOTE(review): who allocates and frees them */
193         char       *at_datname; /* is not visible here */
194 } autovac_table;
195
196 /*-------------
197  * This struct holds information about a single worker's whereabouts.  We keep
198  * an array of these in shared memory, sized according to
199  * autovacuum_max_workers.
200  *
201  * wi_links             entry into free list or running list (av_freeWorkers / av_runningWorkers)
202  * wi_dboid             OID of the database this worker is supposed to work on
203  * wi_tableoid  OID of the table currently being vacuumed, if any
204  * wi_proc              pointer to PGPROC of the running worker, NULL if not started
205  * wi_launchtime Time at which this worker was launched; the launcher uses it to give up on a start that takes too long
206  * wi_cost_*    Vacuum cost-based delay parameters current in this worker
207  *
208  * All fields are protected by AutovacuumLock, except for wi_tableoid which is
209  * protected by AutovacuumScheduleLock (which is read-only for everyone except
210  * that worker itself).
211  *-------------
212  */
213 typedef struct WorkerInfoData
214 {
215         dlist_node      wi_links;
216         Oid                     wi_dboid;
217         Oid                     wi_tableoid;
218         PGPROC     *wi_proc;
219         TimestampTz wi_launchtime;
220         int                     wi_cost_delay;
221         int                     wi_cost_limit;
222         int                     wi_cost_limit_base;
223 } WorkerInfoData;
224
225 typedef struct WorkerInfoData *WorkerInfo;      /* note: WorkerInfo is a pointer type */
226
227 /*
228  * Possible signals received by the launcher from remote processes.  These are
229  * stored atomically in shared memory so that other processes can set them
230  * without locking; the launcher tests and clears them after receiving SIGUSR2.
231  */
232 typedef enum
233 {
234         AutoVacForkFailed,                      /* failed trying to start a worker */
235         AutoVacRebalance,                       /* rebalance the cost limits */
236         AutoVacNumSignals                       /* must be last: it is the length of av_signal[] */
237 }       AutoVacuumSignal;
238
239 /*-------------
240  * The main autovacuum shmem struct.  On shared memory we store this main
241  * struct and the array of WorkerInfo structs.  This struct keeps:
242  *
243  * av_signal            set by other processes to indicate various conditions (indexed by AutoVacuumSignal)
244  * av_launcherpid       the PID of the autovacuum launcher (stored by the launcher itself at startup)
245  * av_freeWorkers       the WorkerInfo freelist
246  * av_runningWorkers the WorkerInfo non-free queue
247  * av_startingWorker pointer to WorkerInfo currently being started (cleared by
248  *                                      the worker itself as soon as it's up and running, or by the launcher if the start times out)
249  *
250  * This struct is protected by AutovacuumLock, except for av_signal and parts
251  * of the worker list (see above).
252  *-------------
253  */
254 typedef struct
255 {
256         sig_atomic_t av_signal[AutoVacNumSignals];
257         pid_t           av_launcherpid;
258         dlist_head      av_freeWorkers;
259         dlist_head      av_runningWorkers;
260         WorkerInfo      av_startingWorker;
261 } AutoVacuumShmemStruct;
262
263 static AutoVacuumShmemStruct *AutoVacuumShmem;
264
265 /*
266  * the database list (of avl_dbase elements) in the launcher, and the context
267  * that contains it; the launcher's error-recovery path resets both
268  */
269 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
270 static MemoryContext DatabaseListCxt = NULL;
271
272 /* Pointer to my own WorkerInfo, valid on each worker */
273 static WorkerInfo MyWorkerInfo = NULL;
274
275 /* PID of launcher, valid only in worker while shutting down (non-static global) */
276 int                     AutovacuumLauncherPid = 0;
277
278 #ifdef EXEC_BACKEND     /* fork+exec launch helpers exist only in this build mode */
279 static pid_t avlauncher_forkexec(void);
280 static pid_t avworker_forkexec(void);
281 #endif  /* EXEC_BACKEND */
282 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) __attribute__((noreturn));
283 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) __attribute__((noreturn));
284 /* launcher-side routines; most of them are called from AutoVacLauncherMain */
285 static Oid      do_start_worker(void);
286 static void launcher_determine_sleep(bool canlaunch, bool recursing,
287                                                  struct timeval * nap);
288 static void launch_worker(TimestampTz now);
289 static List *get_database_list(void);
290 static void rebuild_database_list(Oid newdb);
291 static int      db_comparator(const void *a, const void *b);
292 static void autovac_balance_cost(void);
293 /* NOTE(review): the remaining routines look worker-side, judging by names only -- confirm */
294 static void do_autovacuum(void);
295 static void FreeWorkerInfo(int code, Datum arg);
296
297 static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
298                                           TupleDesc pg_class_desc);
299 static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
300                                                   Form_pg_class classForm,
301                                                   PgStat_StatTabEntry *tabentry,
302                                                   bool *dovacuum, bool *doanalyze, bool *wraparound);
303
304 static void autovacuum_do_vac_analyze(autovac_table *tab,
305                                                   BufferAccessStrategy bstrategy);
306 static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
307                                          TupleDesc pg_class_desc);
308 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
309                                                   PgStat_StatDBEntry *shared,
310                                                   PgStat_StatDBEntry *dbentry);
311 static void autovac_report_activity(autovac_table *tab);
312 static void avl_sighup_handler(SIGNAL_ARGS);    /* installed for SIGHUP by the launcher */
313 static void avl_sigusr2_handler(SIGNAL_ARGS);   /* installed for SIGUSR2 by the launcher */
314 static void avl_sigterm_handler(SIGNAL_ARGS);   /* installed for SIGTERM by the launcher */
315 static void autovac_refresh_stats(void);
316
317
318
319 /********************************************************************
320  *                                        AUTOVACUUM LAUNCHER CODE
321  ********************************************************************/
322
323 #ifdef EXEC_BACKEND
/*
 * avlauncher_forkexec
 *		EXEC_BACKEND launch path for the autovacuum launcher process.
 *
 * Builds the argument vector for the child and hands it to
 * postmaster_forkexec(), whose result is returned unchanged.
 */
static pid_t
avlauncher_forkexec(void)
{
	char	   *child_argv[10];
	int			nargs;

	child_argv[0] = "postgres";
	child_argv[1] = "--forkavlauncher";
	child_argv[2] = NULL;		/* slot filled in by postmaster_forkexec */
	nargs = 3;

	/* terminating entry; deliberately not counted in nargs */
	child_argv[nargs] = NULL;

	Assert(nargs < lengthof(child_argv));

	return postmaster_forkexec(nargs, child_argv);
}
344
345 /*
346  * Mark this process as the autovacuum launcher.  We need this set from the outside, before InitProcess is called
347  */
348 void
349 AutovacuumLauncherIAm(void)
350 {
351         am_autovacuum_launcher = true;
352 }
353 #endif
354
355 /*
356  * Main entry point for autovacuum launcher process, to be called from the
357  * postmaster.
358  */
359 int
360 StartAutoVacLauncher(void)
361 {
362         pid_t           AutoVacPID;
363
364 #ifdef EXEC_BACKEND
365         switch ((AutoVacPID = avlauncher_forkexec()))
366 #else
367         switch ((AutoVacPID = fork_process()))
368 #endif
369         {
370                 case -1:
371                         ereport(LOG,
372                                  (errmsg("could not fork autovacuum launcher process: %m")));
373                         return 0;
374
375 #ifndef EXEC_BACKEND
376                 case 0:
377                         /* in postmaster child ... */
378                         /* Close the postmaster's sockets */
379                         ClosePostmasterPorts(false);
380
381                         /* Lose the postmaster's on-exit routines */
382                         on_exit_reset();
383
384                         AutoVacLauncherMain(0, NULL);
385                         break;
386 #endif
387                 default:
388                         return (int) AutoVacPID;
389         }
390
391         /* shouldn't get here */
392         return 0;
393 }
394
395 /*
396  * Main loop for the autovacuum launcher process.
397  */
398 NON_EXEC_STATIC void
399 AutoVacLauncherMain(int argc, char *argv[])
400 {
401         sigjmp_buf      local_sigjmp_buf;
402
403         /* we are a postmaster subprocess now */
404         IsUnderPostmaster = true;
405         am_autovacuum_launcher = true;
406
407         /* reset MyProcPid */
408         MyProcPid = getpid();
409
410         /* record Start Time for logging */
411         MyStartTime = time(NULL);
412
413         /* Identify myself via ps */
414         init_ps_display("autovacuum launcher process", "", "", "");
415
416         ereport(LOG,
417                         (errmsg("autovacuum launcher started")));
418
419         if (PostAuthDelay)
420                 pg_usleep(PostAuthDelay * 1000000L);
421
422         SetProcessingMode(InitProcessing);
423
424         /*
425          * If possible, make this process a group leader, so that the postmaster
426          * can signal any child processes too.  (autovacuum probably never has any
427          * child processes, but for consistency we make all postmaster child
428          * processes do this.)
429          */
430 #ifdef HAVE_SETSID
431         if (setsid() < 0)
432                 elog(FATAL, "setsid() failed: %m");
433 #endif
434
435         /*
436          * Set up signal handlers.      We operate on databases much like a regular
437          * backend, so we use the same signal handling.  See equivalent code in
438          * tcop/postgres.c.
439          */
440         pqsignal(SIGHUP, avl_sighup_handler);
441         pqsignal(SIGINT, StatementCancelHandler);
442         pqsignal(SIGTERM, avl_sigterm_handler);
443
444         pqsignal(SIGQUIT, quickdie);
445         InitializeTimeouts();           /* establishes SIGALRM handler */
446
447         pqsignal(SIGPIPE, SIG_IGN);
448         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
449         pqsignal(SIGUSR2, avl_sigusr2_handler);
450         pqsignal(SIGFPE, FloatExceptionHandler);
451         pqsignal(SIGCHLD, SIG_DFL);
452
453         /* Early initialization */
454         BaseInit();
455
456         /*
457          * Create a per-backend PGPROC struct in shared memory, except in the
458          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
459          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
460          * had to do some stuff with LWLocks).
461          */
462 #ifndef EXEC_BACKEND
463         InitProcess();
464 #endif
465
466         InitPostgres(NULL, InvalidOid, NULL, NULL);
467
468         SetProcessingMode(NormalProcessing);
469
470         /*
471          * Create a memory context that we will do all our work in.  We do this so
472          * that we can reset the context during error recovery and thereby avoid
473          * possible memory leaks.
474          */
475         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
476                                                                                   "Autovacuum Launcher",
477                                                                                   ALLOCSET_DEFAULT_MINSIZE,
478                                                                                   ALLOCSET_DEFAULT_INITSIZE,
479                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
480         MemoryContextSwitchTo(AutovacMemCxt);
481
482         /*
483          * If an exception is encountered, processing resumes here.
484          *
485          * This code is a stripped down version of PostgresMain error recovery.
486          */
487         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
488         {
489                 /* since not using PG_TRY, must reset error stack by hand */
490                 error_context_stack = NULL;
491
492                 /* Prevents interrupts while cleaning up */
493                 HOLD_INTERRUPTS();
494
495                 /* Forget any pending QueryCancel or timeout request */
496                 disable_all_timeouts(false);
497                 QueryCancelPending = false;             /* second to avoid race condition */
498
499                 /* Report the error to the server log */
500                 EmitErrorReport();
501
502                 /* Abort the current transaction in order to recover */
503                 AbortCurrentTransaction();
504
505                 /*
506                  * Now return to normal top-level context and clear ErrorContext for
507                  * next time.
508                  */
509                 MemoryContextSwitchTo(AutovacMemCxt);
510                 FlushErrorState();
511
512                 /* Flush any leaked data in the top-level context */
513                 MemoryContextResetAndDeleteChildren(AutovacMemCxt);
514
515                 /* don't leave dangling pointers to freed memory */
516                 DatabaseListCxt = NULL;
517                 dlist_init(&DatabaseList);
518
519                 /*
520                  * Make sure pgstat also considers our stat data as gone.  Note: we
521                  * mustn't use autovac_refresh_stats here.
522                  */
523                 pgstat_clear_snapshot();
524
525                 /* Now we can allow interrupts again */
526                 RESUME_INTERRUPTS();
527
528                 /*
529                  * Sleep at least 1 second after any error.  We don't want to be
530                  * filling the error logs as fast as we can.
531                  */
532                 pg_usleep(1000000L);
533         }
534
535         /* We can now handle ereport(ERROR) */
536         PG_exception_stack = &local_sigjmp_buf;
537
538         /* must unblock signals before calling rebuild_database_list */
539         PG_SETMASK(&UnBlockSig);
540
541         /*
542          * Force zero_damaged_pages OFF in the autovac process, even if it is set
543          * in postgresql.conf.  We don't really want such a dangerous option being
544          * applied non-interactively.
545          */
546         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
547
548         /*
549          * Force statement_timeout and lock_timeout to zero to avoid letting these
550          * settings prevent regular maintenance from being executed.
551          */
552         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
553         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
554
555         /*
556          * Force default_transaction_isolation to READ COMMITTED.  We don't want
557          * to pay the overhead of serializable mode, nor add any risk of causing
558          * deadlocks or delaying other transactions.
559          */
560         SetConfigOption("default_transaction_isolation", "read committed",
561                                         PGC_SUSET, PGC_S_OVERRIDE);
562
563         /* in emergency mode, just start a worker and go away */
564         if (!AutoVacuumingActive())
565         {
566                 do_start_worker();
567                 proc_exit(0);                   /* done */
568         }
569
570         AutoVacuumShmem->av_launcherpid = MyProcPid;
571
572         /*
573          * Create the initial database list.  The invariant we want this list to
574          * keep is that it's ordered by decreasing next_time.  As soon as an entry
575          * is updated to a higher time, it will be moved to the front (which is
576          * correct because the only operation is to add autovacuum_naptime to the
577          * entry, and time always increases).
578          */
579         rebuild_database_list(InvalidOid);
580
581         for (;;)
582         {
583                 struct timeval nap;
584                 TimestampTz current_time = 0;
585                 bool            can_launch;
586                 int                     rc;
587
588                 /*
589                  * This loop is a bit different from the normal use of WaitLatch,
590                  * because we'd like to sleep before the first launch of a child
591                  * process.  So it's WaitLatch, then ResetLatch, then check for
592                  * wakening conditions.
593                  */
594
595                 launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
596                                                                  false, &nap);
597
598                 /* Allow sinval catchup interrupts while sleeping */
599                 EnableCatchupInterrupt();
600
601                 /*
602                  * Wait until naptime expires or we get some type of signal (all the
603                  * signal handlers will wake us by calling SetLatch).
604                  */
605                 rc = WaitLatch(&MyProc->procLatch,
606                                            WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
607                                            (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
608
609                 ResetLatch(&MyProc->procLatch);
610
611                 DisableCatchupInterrupt();
612
613                 /*
614                  * Emergency bailout if postmaster has died.  This is to avoid the
615                  * necessity for manual cleanup of all postmaster children.
616                  */
617                 if (rc & WL_POSTMASTER_DEATH)
618                         proc_exit(1);
619
620                 /* the normal shutdown case */
621                 if (got_SIGTERM)
622                         break;
623
624                 if (got_SIGHUP)
625                 {
626                         got_SIGHUP = false;
627                         ProcessConfigFile(PGC_SIGHUP);
628
629                         /* shutdown requested in config file? */
630                         if (!AutoVacuumingActive())
631                                 break;
632
633                         /* rebalance in case the default cost parameters changed */
634                         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
635                         autovac_balance_cost();
636                         LWLockRelease(AutovacuumLock);
637
638                         /* rebuild the list in case the naptime changed */
639                         rebuild_database_list(InvalidOid);
640                 }
641
642                 /*
643                  * a worker finished, or postmaster signalled failure to start a
644                  * worker
645                  */
646                 if (got_SIGUSR2)
647                 {
648                         got_SIGUSR2 = false;
649
650                         /* rebalance cost limits, if needed */
651                         if (AutoVacuumShmem->av_signal[AutoVacRebalance])
652                         {
653                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
654                                 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
655                                 autovac_balance_cost();
656                                 LWLockRelease(AutovacuumLock);
657                         }
658
659                         if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
660                         {
661                                 /*
662                                  * If the postmaster failed to start a new worker, we sleep
663                                  * for a little while and resend the signal.  The new worker's
664                                  * state is still in memory, so this is sufficient.  After
665                                  * that, we restart the main loop.
666                                  *
667                                  * XXX should we put a limit to the number of times we retry?
668                                  * I don't think it makes much sense, because a future start
669                                  * of a worker will continue to fail in the same way.
670                                  */
671                                 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
672                                 pg_usleep(1000000L);    /* 1s */
673                                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
674                                 continue;
675                         }
676                 }
677
678                 /*
679                  * There are some conditions that we need to check before trying to
680                  * start a worker.  First, we need to make sure that there is a
681                  * worker slot available.  Second, we need to make sure that no
682                  * other worker failed while starting up.
683                  */
684
685                 current_time = GetCurrentTimestamp();
686                 LWLockAcquire(AutovacuumLock, LW_SHARED);
687
688                 can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
689
690                 if (AutoVacuumShmem->av_startingWorker != NULL)
691                 {
692                         int                     waittime;
693                         WorkerInfo      worker = AutoVacuumShmem->av_startingWorker;
694
695                         /*
696                          * We can't launch another worker when another one is still
697                          * starting up (or failed while doing so), so just sleep for a bit
698                          * more; that worker will wake us up again as soon as it's ready.
699                          * We will only wait autovacuum_naptime seconds (up to a maximum
700                          * of 60 seconds) for this to happen however.  Note that failure
701                          * to connect to a particular database is not a problem here,
702                          * because the worker removes itself from the startingWorker
703                          * pointer before trying to connect.  Problems detected by the
704                          * postmaster (like fork() failure) are also reported and handled
705                          * differently.  The only problems that may cause this code to
706                          * fire are errors in the earlier sections of AutoVacWorkerMain,
707                          * before the worker removes the WorkerInfo from the
708                          * startingWorker pointer.
709                          */
710                         waittime = Min(autovacuum_naptime, 60) * 1000;
711                         if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
712                                                                                    waittime))
713                         {
714                                 LWLockRelease(AutovacuumLock);
715                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
716
717                                 /*
718                                  * No other process can put a worker in starting mode, so if
719                                  * startingWorker is still set (non-NULL) after exchanging our lock,
720                                  * we assume it's the same one we saw above (so we don't
721                                  * recheck the launch time).
722                                  */
723                                 if (AutoVacuumShmem->av_startingWorker != NULL)
724                                 {
725                                         worker = AutoVacuumShmem->av_startingWorker;
726                                         worker->wi_dboid = InvalidOid;
727                                         worker->wi_tableoid = InvalidOid;
728                                         worker->wi_proc = NULL;
729                                         worker->wi_launchtime = 0;
730                                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
731                                                                         &worker->wi_links);
732                                         AutoVacuumShmem->av_startingWorker = NULL;
733                                         elog(WARNING, "worker took too long to start; canceled");
734                                 }
735                         }
736                         else
737                                 can_launch = false;
738                 }
739                 LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
740
741                 /* if we can't do anything, just go back to sleep */
742                 if (!can_launch)
743                         continue;
744
745                 /* We're OK to start a new worker */
746
747                 if (dlist_is_empty(&DatabaseList))
748                 {
749                         /*
750                          * Special case when the list is empty: start a worker right away.
751                          * This covers the initial case, when no database is in pgstats
752                          * (thus the list is empty).  Note that the constraints in
753                          * launcher_determine_sleep keep us from starting workers too
754                          * quickly (at most once every autovacuum_naptime when the list is
755                          * empty).
756                          */
757                         launch_worker(current_time);
758                 }
759                 else
760                 {
761                         /*
762                          * because rebuild_database_list constructs a list with most
763                          * distant adl_next_worker first, we obtain our database from the
764                          * tail of the list.
765                          */
766                         avl_dbase  *avdb;
767
768                         avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
769
770                         /*
771                          * launch a worker if next_worker is right now or it is in the
772                          * past
773                          */
774                         if (TimestampDifferenceExceeds(avdb->adl_next_worker,
775                                                                                    current_time, 0))
776                                 launch_worker(current_time);
777                 }
778         }
779
780         /* Normal exit from the autovac launcher is here */
781         ereport(LOG,
782                         (errmsg("autovacuum launcher shutting down")));
783         AutoVacuumShmem->av_launcherpid = 0;
784
785         proc_exit(0);                           /* done */
786 }
787
788 /*
789  * Determine the time to sleep, based on the database list.
790  *
791  * The "canlaunch" parameter indicates whether we can start a worker right now,
792  * for example due to the workers being all busy.  If this is false, we will
793  * cause a long sleep, which will be interrupted when a worker exits.
794  */
795 static void
796 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap)
797 {
798         /*
799          * We sleep until the next scheduled vacuum.  We trust that when the
800          * database list was built, care was taken so that no entries have times
801          * in the past; if the first entry has too close a next_worker value, or a
802          * time in the past, we will sleep a small nominal time.
803          */
804         if (!canlaunch)
805         {
806                 nap->tv_sec = autovacuum_naptime;
807                 nap->tv_usec = 0;
808         }
809         else if (!dlist_is_empty(&DatabaseList))
810         {
811                 TimestampTz current_time = GetCurrentTimestamp();
812                 TimestampTz next_wakeup;
813                 avl_dbase  *avdb;
814                 long            secs;
815                 int                     usecs;
816
817                 avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
818
819                 next_wakeup = avdb->adl_next_worker;
820                 TimestampDifference(current_time, next_wakeup, &secs, &usecs);
821
822                 nap->tv_sec = secs;
823                 nap->tv_usec = usecs;
824         }
825         else
826         {
827                 /* list is empty, sleep for whole autovacuum_naptime seconds  */
828                 nap->tv_sec = autovacuum_naptime;
829                 nap->tv_usec = 0;
830         }
831
832         /*
833          * If the result is exactly zero, it means a database had an entry with
834          * time in the past.  Rebuild the list so that the databases are evenly
835          * distributed again, and recalculate the time to sleep.  This can happen
836          * if there are more tables needing vacuum than workers, and they all take
837          * longer to vacuum than autovacuum_naptime.
838          *
839          * We only recurse once.  rebuild_database_list should always return times
840          * in the future, but it seems best not to trust too much on that.
841          */
842         if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
843         {
844                 rebuild_database_list(InvalidOid);
845                 launcher_determine_sleep(canlaunch, true, nap);
846                 return;
847         }
848
849         /* The smallest time we'll allow the launcher to sleep. */
850         if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
851         {
852                 nap->tv_sec = 0;
853                 nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
854         }
855 }
856
857 /*
858  * Build an updated DatabaseList.  It must only contain databases that appear
859  * in pgstats, and must be sorted by next_worker from highest to lowest,
860  * distributed regularly across the next autovacuum_naptime interval.
861  *
862  * Receives the Oid of the database that made this list be generated (we call
863  * this the "new" database, because when the database was already present on
864  * the list, we expect that this function is not called at all).  The
865  * preexisting list, if any, will be used to preserve the order of the
866  * databases in the autovacuum_naptime period.  The new database is put at the
867  * end of the interval.  The actual values are not saved, which should not be
868  * much of a problem.
869  */
870 static void
871 rebuild_database_list(Oid newdb)
872 {
873         List       *dblist;
874         ListCell   *cell;
875         MemoryContext newcxt;
876         MemoryContext oldcxt;
877         MemoryContext tmpcxt;
878         HASHCTL         hctl;
879         int                     score;
880         int                     nelems;
881         HTAB       *dbhash;
882         dlist_iter      iter;
883
884         /* use fresh stats */
885         autovac_refresh_stats();
886
887         newcxt = AllocSetContextCreate(AutovacMemCxt,
888                                                                    "AV dblist",
889                                                                    ALLOCSET_DEFAULT_MINSIZE,
890                                                                    ALLOCSET_DEFAULT_INITSIZE,
891                                                                    ALLOCSET_DEFAULT_MAXSIZE);
892         tmpcxt = AllocSetContextCreate(newcxt,
893                                                                    "tmp AV dblist",
894                                                                    ALLOCSET_DEFAULT_MINSIZE,
895                                                                    ALLOCSET_DEFAULT_INITSIZE,
896                                                                    ALLOCSET_DEFAULT_MAXSIZE);
897         oldcxt = MemoryContextSwitchTo(tmpcxt);
898
899         /*
900          * Implementing this is not as simple as it sounds, because we need to put
901          * the new database at the end of the list; next the databases that were
902          * already on the list, and finally (at the tail of the list) all the
903          * other databases that are not on the existing list.
904          *
905          * To do this, we build an empty hash table of scored databases.  We will
906          * start with the lowest score (zero) for the new database, then
907          * increasing scores for the databases in the existing list, in order, and
908          * lastly increasing scores for all databases gotten via
909          * get_database_list() that are not already on the hash.
910          *
911          * Then we will put all the hash elements into an array, sort the array by
912          * score, and finally put the array elements into the new doubly linked
913          * list.
914          */
915         hctl.keysize = sizeof(Oid);
916         hctl.entrysize = sizeof(avl_dbase);
917         hctl.hash = oid_hash;
918         hctl.hcxt = tmpcxt;
919         dbhash = hash_create("db hash", 20, &hctl,      /* magic number here FIXME */
920                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
921
922         /* start by inserting the new database */
923         score = 0;
924         if (OidIsValid(newdb))
925         {
926                 avl_dbase  *db;
927                 PgStat_StatDBEntry *entry;
928
929                 /* only consider this database if it has a pgstat entry */
930                 entry = pgstat_fetch_stat_dbentry(newdb);
931                 if (entry != NULL)
932                 {
933                         /* we assume it isn't found because the hash was just created */
934                         db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
935
936                         /* hash_search already filled in the key */
937                         db->adl_score = score++;
938                         /* next_worker is filled in later */
939                 }
940         }
941
942         /* Now insert the databases from the existing list */
943         dlist_foreach(iter, &DatabaseList)
944         {
945                 avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
946                 avl_dbase  *db;
947                 bool            found;
948                 PgStat_StatDBEntry *entry;
949
950                 /*
951                  * skip databases with no stat entries -- in particular, this gets rid
952                  * of dropped databases
953                  */
954                 entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
955                 if (entry == NULL)
956                         continue;
957
958                 db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
959
960                 if (!found)
961                 {
962                         /* hash_search already filled in the key */
963                         db->adl_score = score++;
964                         /* next_worker is filled in later */
965                 }
966         }
967
968         /* finally, insert all qualifying databases not previously inserted */
969         dblist = get_database_list();
970         foreach(cell, dblist)
971         {
972                 avw_dbase  *avdb = lfirst(cell);
973                 avl_dbase  *db;
974                 bool            found;
975                 PgStat_StatDBEntry *entry;
976
977                 /* only consider databases with a pgstat entry */
978                 entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
979                 if (entry == NULL)
980                         continue;
981
982                 db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
983                 /* only update the score if the database was not already on the hash */
984                 if (!found)
985                 {
986                         /* hash_search already filled in the key */
987                         db->adl_score = score++;
988                         /* next_worker is filled in later */
989                 }
990         }
991         nelems = score;
992
993         /* from here on, the allocated memory belongs to the new list */
994         MemoryContextSwitchTo(newcxt);
995         dlist_init(&DatabaseList);
996
997         if (nelems > 0)
998         {
999                 TimestampTz current_time;
1000                 int                     millis_increment;
1001                 avl_dbase  *dbary;
1002                 avl_dbase  *db;
1003                 HASH_SEQ_STATUS seq;
1004                 int                     i;
1005
1006                 /* put all the hash elements into an array */
1007                 dbary = palloc(nelems * sizeof(avl_dbase));
1008
1009                 i = 0;
1010                 hash_seq_init(&seq, dbhash);
1011                 while ((db = hash_seq_search(&seq)) != NULL)
1012                         memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1013
1014                 /* sort the array */
1015                 qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1016
1017                 /*
1018                  * Determine the time interval between databases in the schedule. If
1019                  * we see that the configured naptime would take us to sleep times
1020                  * lower than our min sleep time (which launcher_determine_sleep is
1021                  * coded not to allow), silently use a larger naptime (but don't touch
1022                  * the GUC variable).
1023                  */
1024                 millis_increment = 1000.0 * autovacuum_naptime / nelems;
1025                 if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1026                         millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1027
1028                 current_time = GetCurrentTimestamp();
1029
1030                 /*
1031                  * move the elements from the array into the dllist, setting the
1032                  * next_worker while walking the array
1033                  */
1034                 for (i = 0; i < nelems; i++)
1035                 {
1036                         avl_dbase  *db = &(dbary[i]);
1037
1038                         current_time = TimestampTzPlusMilliseconds(current_time,
1039                                                                                                            millis_increment);
1040                         db->adl_next_worker = current_time;
1041
1042                         /* later elements should go closer to the head of the list */
1043                         dlist_push_head(&DatabaseList, &db->adl_node);
1044                 }
1045         }
1046
1047         /* all done, clean up memory */
1048         if (DatabaseListCxt != NULL)
1049                 MemoryContextDelete(DatabaseListCxt);
1050         MemoryContextDelete(tmpcxt);
1051         DatabaseListCxt = newcxt;
1052         MemoryContextSwitchTo(oldcxt);
1053 }
1054
1055 /* qsort comparator for avl_dbase, using adl_score */
1056 static int
1057 db_comparator(const void *a, const void *b)
1058 {
1059         if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1060                 return 0;
1061         else
1062                 return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1063 }
1064
1065 /*
1066  * do_start_worker
1067  *
1068  * Bare-bones procedure for starting an autovacuum worker from the launcher.
1069  * It determines what database to work on, sets up shared memory stuff and
1070  * signals postmaster to start the worker.      It fails gracefully if invoked when
1071  * autovacuum_workers are already active.
1072  *
1073  * Return value is the OID of the database that the worker is going to process,
1074  * or InvalidOid if no worker was actually started.
1075  */
1076 static Oid
1077 do_start_worker(void)
1078 {
1079         List       *dblist;
1080         ListCell   *cell;
1081         TransactionId xidForceLimit;
1082         MultiXactId multiForceLimit;
1083         bool            for_xid_wrap;
1084         bool            for_multi_wrap;
1085         avw_dbase  *avdb;
1086         TimestampTz current_time;
1087         bool            skipit = false;
1088         Oid                     retval = InvalidOid;
1089         MemoryContext tmpcxt,
1090                                 oldcxt;
1091
1092         /* return quickly when there are no free workers */
1093         LWLockAcquire(AutovacuumLock, LW_SHARED);
1094         if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1095         {
1096                 LWLockRelease(AutovacuumLock);
1097                 return InvalidOid;
1098         }
1099         LWLockRelease(AutovacuumLock);
1100
1101         /*
1102          * Create and switch to a temporary context to avoid leaking the memory
1103          * allocated for the database list.
1104          */
1105         tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1106                                                                    "Start worker tmp cxt",
1107                                                                    ALLOCSET_DEFAULT_MINSIZE,
1108                                                                    ALLOCSET_DEFAULT_INITSIZE,
1109                                                                    ALLOCSET_DEFAULT_MAXSIZE);
1110         oldcxt = MemoryContextSwitchTo(tmpcxt);
1111
1112         /* use fresh stats */
1113         autovac_refresh_stats();
1114
1115         /* Get a list of databases */
1116         dblist = get_database_list();
1117
1118         /*
1119          * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1120          * pass without forcing a vacuum.  (This limit can be tightened for
1121          * particular tables, but not loosened.)
1122          */
1123         recentXid = ReadNewTransactionId();
1124         xidForceLimit = recentXid - autovacuum_freeze_max_age;
1125         /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1126         /* this can cause the limit to go backwards by 3, but that's OK */
1127         if (xidForceLimit < FirstNormalTransactionId)
1128                 xidForceLimit -= FirstNormalTransactionId;
1129
1130         /* Also determine the oldest datminmxid we will consider. */
1131         recentMulti = ReadNextMultiXactId();
1132         multiForceLimit = recentMulti - autovacuum_freeze_max_age;
1133         if (multiForceLimit < FirstMultiXactId)
1134                 multiForceLimit -= FirstMultiXactId;
1135
1136         /*
1137          * Choose a database to connect to.  We pick the database that was least
1138          * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1139          * wraparound-related data loss.  If any db at risk of Xid wraparound is
1140          * found, we pick the one with oldest datfrozenxid, independently of
1141          * autovacuum times; similarly we pick the one with the oldest datminmxid
1142          * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1143          * danger are given more priority than those in multi wraparound danger.
1144          *
1145          * Note that a database with no stats entry is not considered, except for
1146          * Xid wraparound purposes.  The theory is that if no one has ever
1147          * connected to it since the stats were last initialized, it doesn't need
1148          * vacuuming.
1149          *
1150          * XXX This could be improved if we had more info about whether it needs
1151          * vacuuming before connecting to it.  Perhaps look through the pgstats
1152          * data for the database's tables?  One idea is to keep track of the
1153          * number of new and dead tuples per database in pgstats.  However it
1154          * isn't clear how to construct a metric that measures that and not cause
1155          * starvation for less busy databases.
1156          */
1157         avdb = NULL;
1158         for_xid_wrap = false;
1159         for_multi_wrap = false;
1160         current_time = GetCurrentTimestamp();
1161         foreach(cell, dblist)
1162         {
1163                 avw_dbase  *tmp = lfirst(cell);
1164                 dlist_iter      iter;
1165
1166                 /* Check to see if this one is at risk of wraparound */
1167                 if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1168                 {
1169                         if (avdb == NULL ||
1170                                 TransactionIdPrecedes(tmp->adw_frozenxid,
1171                                                                           avdb->adw_frozenxid))
1172                                 avdb = tmp;
1173                         for_xid_wrap = true;
1174                         continue;
1175                 }
1176                 else if (for_xid_wrap)
1177                         continue;                       /* ignore not-at-risk DBs */
1178                 else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1179                 {
1180                         if (avdb == NULL ||
1181                                 MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1182                                 avdb = tmp;
1183                         for_multi_wrap = true;
1184                         continue;
1185                 }
1186                 else if (for_multi_wrap)
1187                         continue;                       /* ignore not-at-risk DBs */
1188
1189                 /* Find pgstat entry if any */
1190                 tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1191
1192                 /*
1193                  * Skip a database with no pgstat entry; it means it hasn't seen any
1194                  * activity.
1195                  */
1196                 if (!tmp->adw_entry)
1197                         continue;
1198
1199                 /*
1200                  * Also, skip a database that appears on the database list as having
1201                  * been processed recently (less than autovacuum_naptime seconds ago).
1202                  * We do this so that we don't select a database which we just
1203                  * selected, but that pgstat hasn't gotten around to updating the last
1204                  * autovacuum time yet.
1205                  */
1206                 skipit = false;
1207
1208                 dlist_reverse_foreach(iter, &DatabaseList)
1209                 {
1210                         avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1211
1212                         if (dbp->adl_datid == tmp->adw_datid)
1213                         {
1214                                 /*
1215                                  * Skip this database if its next_worker value falls between
1216                                  * the current time and the current time plus naptime.
1217                                  */
1218                                 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1219                                                                                                 current_time, 0) &&
1220                                         !TimestampDifferenceExceeds(current_time,
1221                                                                                                 dbp->adl_next_worker,
1222                                                                                                 autovacuum_naptime * 1000))
1223                                         skipit = true;
1224
1225                                 break;
1226                         }
1227                 }
1228                 if (skipit)
1229                         continue;
1230
1231                 /*
1232                  * Remember the db with oldest autovac time.  (If we are here, both
1233                  * tmp->entry and db->entry must be non-null.)
1234                  */
1235                 if (avdb == NULL ||
1236                         tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1237                         avdb = tmp;
1238         }
1239
1240         /* Found a database -- process it */
1241         if (avdb != NULL)
1242         {
1243                 WorkerInfo      worker;
1244                 dlist_node *wptr;
1245
1246                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1247
1248                 /*
1249                  * Get a worker entry from the freelist.  We checked above, so there
1250                  * really should be a free slot.
1251                  */
1252                 wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1253
1254                 worker = dlist_container(WorkerInfoData, wi_links, wptr);
1255                 worker->wi_dboid = avdb->adw_datid;
1256                 worker->wi_proc = NULL;
1257                 worker->wi_launchtime = GetCurrentTimestamp();
1258
1259                 AutoVacuumShmem->av_startingWorker = worker;
1260
1261                 LWLockRelease(AutovacuumLock);
1262
1263                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1264
1265                 retval = avdb->adw_datid;
1266         }
1267         else if (skipit)
1268         {
1269                 /*
1270                  * If we skipped all databases on the list, rebuild it, because it
1271                  * probably contains a dropped database.
1272                  */
1273                 rebuild_database_list(InvalidOid);
1274         }
1275
1276         MemoryContextSwitchTo(oldcxt);
1277         MemoryContextDelete(tmpcxt);
1278
1279         return retval;
1280 }
1281
1282 /*
1283  * launch_worker
1284  *
1285  * Wrapper for starting a worker from the launcher.  Besides actually starting
1286  * it, update the database list to reflect the next time that another one will
1287  * need to be started on the selected database.  The actual database choice is
1288  * left to do_start_worker.
1289  *
1290  * This routine is also expected to insert an entry into the database list if
1291  * the selected database was previously absent from the list.
1292  */
1293 static void
1294 launch_worker(TimestampTz now)
1295 {
1296         Oid                     dbid;
1297         dlist_iter      iter;
1298
1299         dbid = do_start_worker();
1300         if (OidIsValid(dbid))
1301         {
1302                 bool            found = false;
1303
1304                 /*
1305                  * Walk the database list and update the corresponding entry.  If the
1306                  * database is not on the list, we'll recreate the list.
1307                  */
1308                 dlist_foreach(iter, &DatabaseList)
1309                 {
1310                         avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1311
1312                         if (avdb->adl_datid == dbid)
1313                         {
1314                                 found = true;
1315
1316                                 /*
1317                                  * add autovacuum_naptime seconds to the current time, and use
1318                                  * that as the new "next_worker" field for this database.
1319                                  */
1320                                 avdb->adl_next_worker =
1321                                         TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1322
1323                                 dlist_move_head(&DatabaseList, iter.cur);
1324                                 break;
1325                         }
1326                 }
1327
1328                 /*
1329                  * If the database was not present in the database list, we rebuild
1330                  * the list.  It's possible that the database does not get into the
1331                  * list anyway, for example if it's a database that doesn't have a
1332                  * pgstat entry, but this is not a problem because we don't want to
1333                  * schedule workers regularly into those in any case.
1334                  */
1335                 if (!found)
1336                         rebuild_database_list(dbid);
1337         }
1338 }
1339
1340 /*
1341  * Called from postmaster to signal a failure to fork a process to become
1342  * worker.      The postmaster should kill(SIGUSR2) the launcher shortly
1343  * after calling this function.
1344  */
1345 void
1346 AutoVacWorkerFailed(void)
1347 {
1348         AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1349 }
1350
1351 /* SIGHUP: set flag to re-read config file at next convenient time */
1352 static void
1353 avl_sighup_handler(SIGNAL_ARGS)
1354 {
1355         int                     save_errno = errno;
1356
1357         got_SIGHUP = true;
1358         if (MyProc)
1359                 SetLatch(&MyProc->procLatch);
1360
1361         errno = save_errno;
1362 }
1363
1364 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1365 static void
1366 avl_sigusr2_handler(SIGNAL_ARGS)
1367 {
1368         int                     save_errno = errno;
1369
1370         got_SIGUSR2 = true;
1371         if (MyProc)
1372                 SetLatch(&MyProc->procLatch);
1373
1374         errno = save_errno;
1375 }
1376
1377 /* SIGTERM: time to die */
1378 static void
1379 avl_sigterm_handler(SIGNAL_ARGS)
1380 {
1381         int                     save_errno = errno;
1382
1383         got_SIGTERM = true;
1384         if (MyProc)
1385                 SetLatch(&MyProc->procLatch);
1386
1387         errno = save_errno;
1388 }
1389
1390
1391 /********************************************************************
1392  *                                        AUTOVACUUM WORKER CODE
1393  ********************************************************************/
1394
1395 #ifdef EXEC_BACKEND
/*
 * forkexec routine for the autovacuum worker.
 *
 * Builds the argument list for the worker and hands it to
 * postmaster_forkexec, whose result is returned unchanged.
 */
static pid_t
avworker_forkexec(void)
{
	char	   *args[10];
	int			nargs;

	args[0] = "postgres";
	args[1] = "--forkavworker";
	args[2] = NULL;				/* filled in by postmaster_forkexec */
	nargs = 3;

	/* terminate the argument vector */
	args[nargs] = NULL;

	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
1416
1417 /*
1418  * We need this set from the outside, before InitProcess is called
1419  */
1420 void
1421 AutovacuumWorkerIAm(void)
1422 {
1423         am_autovacuum_worker = true;
1424 }
1425 #endif
1426
1427 /*
1428  * StartAutoVacWorker -- launch one autovacuum worker process.
1429  * The forking process gets the child's PID back, or 0 if the fork failed.
1430  * As the original note says, this is modeled closely on pgarch.c, q.v.
1431  */
1432 int
1433 StartAutoVacWorker(void)
1434 {
1435         pid_t           child;
1436
1437 #ifdef EXEC_BACKEND
1438         child = avworker_forkexec();
1439 #else
1440         child = fork_process();
1441 #endif
1442
1443         if (child == -1)
1444         {
1445                 ereport(LOG,
1446                                 (errmsg("could not fork autovacuum worker process: %m")));
1447                 return 0;
1448         }
1449
1450 #ifndef EXEC_BACKEND
1451         if (child == 0)
1452         {
1453                 /* in postmaster child: shed what we inherited from the postmaster */
1454                 ClosePostmasterPorts(false);    /* its sockets */
1455                 on_exit_reset();                /* its on-exit routines */
1456
1457                 AutoVacWorkerMain(0, NULL);
1458
1459                 /* shouldn't get here */
1460                 return 0;
1461         }
1462 #endif
1463
1464         /* still the forking process: report the worker's PID */
1465         return (int) child;
1466 }
1467
1468 /*
1469  * AutoVacWorkerMain -- body of a worker: init, claim av_startingWorker, do_autovacuum
1470  */
1471 NON_EXEC_STATIC void
1472 AutoVacWorkerMain(int argc, char *argv[])
1473 {
1474         sigjmp_buf      local_sigjmp_buf;
1475         Oid                     dbid;
1476
1477         /* we are a postmaster subprocess now */
1478         IsUnderPostmaster = true;
1479         am_autovacuum_worker = true;
1480
1481         /* reset MyProcPid */
1482         MyProcPid = getpid();
1483
1484         /* record Start Time for logging */
1485         MyStartTime = time(NULL);
1486
1487         /* Identify myself via ps */
1488         init_ps_display("autovacuum worker process", "", "", "");
1489
1490         SetProcessingMode(InitProcessing);
1491
1492         /*
1493          * If possible, make this process a group leader, so that the postmaster
1494          * can signal any child processes too.  (autovacuum probably never has any
1495          * child processes, but for consistency we make all postmaster child
1496          * processes do this.)
1497          */
1498 #ifdef HAVE_SETSID
1499         if (setsid() < 0)
1500                 elog(FATAL, "setsid() failed: %m");
1501 #endif
1502
1503         /*
1504          * Set up signal handlers.  We operate on databases much like a regular
1505          * backend, so we use the same signal handling.  See equivalent code in
1506          * tcop/postgres.c.
1507          *
1508          * Currently, we don't pay attention to postgresql.conf changes that
1509          * happen during a single daemon iteration, so we can ignore SIGHUP.
1510          */
1511         pqsignal(SIGHUP, SIG_IGN);
1512
1513         /*
1514          * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1515          * means abort and exit cleanly, and SIGQUIT means abandon ship.
1516          */
1517         pqsignal(SIGINT, StatementCancelHandler);
1518         pqsignal(SIGTERM, die);
1519         pqsignal(SIGQUIT, quickdie);
1520         InitializeTimeouts();           /* establishes SIGALRM handler */
1521
1522         pqsignal(SIGPIPE, SIG_IGN);
1523         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1524         pqsignal(SIGUSR2, SIG_IGN);
1525         pqsignal(SIGFPE, FloatExceptionHandler);
1526         pqsignal(SIGCHLD, SIG_DFL);
1527
1528         /* Early initialization */
1529         BaseInit();
1530
1531         /*
1532          * Create a per-backend PGPROC struct in shared memory, except in the
1533          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1534          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1535          * had to do some stuff with LWLocks).
1536          */
1537 #ifndef EXEC_BACKEND
1538         InitProcess();
1539 #endif
1540
1541         /*
1542          * If an exception is encountered, processing resumes here.
1543          *
1544          * See notes in postgres.c about the design of this coding.
1545          */
1546         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1547         {
1548                 /* Prevents interrupts while cleaning up */
1549                 HOLD_INTERRUPTS();
1550
1551                 /* Report the error to the server log */
1552                 EmitErrorReport();
1553
1554                 /*
1555                  * We can now go away.  Note that because we called InitProcess, a
1556                  * callback was registered to do ProcKill, which will clean up
1557                  * necessary state.
1558                  */
1559                 proc_exit(0);
1560         }
1561
1562         /* We can now handle ereport(ERROR) */
1563         PG_exception_stack = &local_sigjmp_buf;
1564
1565         PG_SETMASK(&UnBlockSig);
1566
1567         /*
1568          * Force zero_damaged_pages OFF in the autovac process, even if it is set
1569          * in postgresql.conf.  We don't really want such a dangerous option being
1570          * applied non-interactively.
1571          */
1572         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1573
1574         /*
1575          * Force statement_timeout and lock_timeout to zero to avoid letting these
1576          * settings prevent regular maintenance from being executed.
1577          */
1578         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1579         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1580
1581         /*
1582          * Force default_transaction_isolation to READ COMMITTED.  We don't want
1583          * to pay the overhead of serializable mode, nor add any risk of causing
1584          * deadlocks or delaying other transactions.
1585          */
1586         SetConfigOption("default_transaction_isolation", "read committed",
1587                                         PGC_SUSET, PGC_S_OVERRIDE);
1588
1589         /*
1590          * Force synchronous replication off to allow regular maintenance even if
1591          * we are waiting for standbys to connect. This is important to ensure we
1592          * aren't blocked from performing anti-wraparound tasks.
1593          */
1594         if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1595                 SetConfigOption("synchronous_commit", "local",
1596                                                 PGC_SUSET, PGC_S_OVERRIDE);
1597
1598         /*
1599          * Get the info about the database we're going to work on.
1600          */
1601         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1602
1603         /*
1604          * beware of av_startingWorker being NULL; this should normally not
1605          * happen, but if a worker fails after forking and before this, the
1606          * launcher might have decided to remove it from the queue and start
1607          * again.
1608          */
1609         if (AutoVacuumShmem->av_startingWorker != NULL)
1610         {
1611                 MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1612                 dbid = MyWorkerInfo->wi_dboid;
1613                 MyWorkerInfo->wi_proc = MyProc;
1614
1615                 /* insert into the running list */
1616                 dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1617                                                 &MyWorkerInfo->wi_links);
1618
1619                 /*
1620                  * remove from the "starting" pointer, so that the launcher can start
1621                  * a new worker if required
1622                  */
1623                 AutoVacuumShmem->av_startingWorker = NULL;
1624                 LWLockRelease(AutovacuumLock);
1625
1626                 on_shmem_exit(FreeWorkerInfo, 0);       /* give the slot back at exit */
1627
1628                 /* wake up the launcher */
1629                 if (AutoVacuumShmem->av_launcherpid != 0)
1630                         kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1631         }
1632         else
1633         {
1634                 /* no worker entry for me, go away */
1635                 elog(WARNING, "autovacuum worker started without a worker entry");
1636                 dbid = InvalidOid;
1637                 LWLockRelease(AutovacuumLock);
1638         }
1639
1640         if (OidIsValid(dbid))
1641         {
1642                 char            dbname[NAMEDATALEN];
1643
1644                 /*
1645                  * Report autovac startup to the stats collector.  We deliberately do
1646                  * this before InitPostgres, so that the last_autovac_time will get
1647                  * updated even if the connection attempt fails.  This is to prevent
1648                  * autovac from getting "stuck" repeatedly selecting an unopenable
1649                  * database, rather than making any progress on stuff it can connect
1650                  * to.
1651                  */
1652                 pgstat_report_autovac(dbid);
1653
1654                 /*
1655                  * Connect to the selected database
1656                  *
1657                  * Note: if we have selected a just-deleted database (due to using
1658                  * stale stats info), we'll fail and exit here.
1659                  */
1660                 InitPostgres(NULL, dbid, NULL, dbname);
1661                 SetProcessingMode(NormalProcessing);
1662                 set_ps_display(dbname, false);
1663                 ereport(DEBUG1,
1664                                 (errmsg("autovacuum: processing database \"%s\"", dbname)));
1665
1666                 if (PostAuthDelay)
1667                         pg_usleep(PostAuthDelay * 1000000L);
1668
1669                 /* And do an appropriate amount of work */
1670                 recentXid = ReadNewTransactionId();
1671                 recentMulti = ReadNextMultiXactId();
1672                 do_autovacuum();
1673         }
1674
1675         /*
1676          * The launcher will be notified of my death in ProcKill, *if* we managed
1677          * to get a worker slot at all
1678          */
1679
1680         /* All done, go away */
1681         proc_exit(0);
1682 }
1683
1684 /*
1685  * on_shmem_exit callback: give this worker's WorkerInfo slot back
1686  */
1687 static void
1688 FreeWorkerInfo(int code, Datum arg)
1689 {
1690         WorkerInfo      slot = MyWorkerInfo;
1691
1692         /* no slot is held by this process: nothing to give back */
1693         if (slot == NULL)
1694                 return;
1695
1696         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1697
1698         /*
1699          * Remember the launcher's PID in process-local memory only.  The signal
1700          * itself is sent later, when our PGPROC is recycled; waking the launcher
1701          * lets it start a replacement worker right away and rebalance the cost
1702          * limits of the workers that remain, so we do this unconditionally.
1703          *
1704          * The launcher's PID could in principle change between this read and
1705          * the eventual kill.  We accept that: ProcKill is expected to run
1706          * shortly after us, and PIDs are assumed not to be reused that quickly.
1707          */
1708         AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1709
1710         /* unlink the slot from its current list and wipe its per-run fields */
1711         dlist_delete(&slot->wi_links);
1712         slot->wi_dboid = InvalidOid;
1713         slot->wi_tableoid = InvalidOid;
1714         slot->wi_proc = NULL;
1715         slot->wi_launchtime = 0;
1716         slot->wi_cost_delay = 0;
1717         slot->wi_cost_limit = 0;
1718         slot->wi_cost_limit_base = 0;
1719
1720         /* park the slot on the free list; it is not ours anymore */
1721         dlist_push_head(&AutoVacuumShmem->av_freeWorkers, &slot->wi_links);
1722         MyWorkerInfo = NULL;
1723
1724         /* we are inactive now, so request a rebalance of the survivors */
1725         AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1726
1727         LWLockRelease(AutovacuumLock);
1728 }
1729
1730 /*
1731  * Load the cost-based delay parameters assigned to this worker, so that
1732  * concurrent workers each consume only a fraction of the available I/O.
1733  */
1734 void
1735 AutoVacuumUpdateDelay(void)
1736 {
1737         /* without a WorkerInfo slot, leave the current settings untouched */
1738         if (MyWorkerInfo == NULL)
1739                 return;
1740         VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1741         VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1742 }
1743
1744 /*
1745  * autovac_balance_cost
1746  *              Redistribute the I/O budget across the running workers.
1747  *
1748  * The AutovacuumLock must be held exclusively by the caller.
1749  */
1750 static void
1751 autovac_balance_cost(void)
1752 {
1753         int                     vac_cost_limit;
1754         int                     vac_cost_delay;
1755         double          rate_wanted = 0.0;
1756         double          rate_allowed;
1757         dlist_iter      iter;
1758
1759         /*
1760          * A worker's I/O rate is cost_limit/cost_delay, so it is those ratios we
1761          * share out evenly, not the bare limits.  Zero is not a valid cost_limit,
1762          * so there (as for a negative delay) we take the value from elsewhere.
1763          */
1764         if (autovacuum_vac_cost_limit > 0)
1765                 vac_cost_limit = autovacuum_vac_cost_limit;
1766         else
1767                 vac_cost_limit = VacuumCostLimit;
1768
1769         if (autovacuum_vac_cost_delay >= 0)
1770                 vac_cost_delay = autovacuum_vac_cost_delay;
1771         else
1772                 vac_cost_delay = VacuumCostDelay;
1773
1774         /* nothing to balance when either setting is unset */
1775         if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1776                 return;
1777
1778         /* add up the base rate requested by every active worker */
1779         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1780         {
1781                 WorkerInfo      w = dlist_container(WorkerInfoData, wi_links, iter.cur);
1782
1783                 if (w->wi_proc != NULL &&
1784                         w->wi_cost_limit_base > 0 && w->wi_cost_delay > 0)
1785                         rate_wanted += (double) w->wi_cost_limit_base / w->wi_cost_delay;
1786         }
1787
1788         /* no worker has a cost limit, so there is nothing to scale */
1789         if (rate_wanted <= 0)
1790                 return;
1791
1792         /* scale each active worker's limit to fit the configured total rate */
1793         rate_allowed = (double) vac_cost_limit / vac_cost_delay;
1794         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1795         {
1796                 WorkerInfo      w = dlist_container(WorkerInfoData, wi_links, iter.cur);
1797
1798                 if (w->wi_proc != NULL &&
1799                         w->wi_cost_limit_base > 0 && w->wi_cost_delay > 0)
1800                 {
1801                         int                     limit;
1802
1803                         limit = (int) (rate_allowed * w->wi_cost_limit_base / rate_wanted);
1804
1805                         /* roundoff must never push us above the worker's base value */
1806                         if (limit > w->wi_cost_limit_base)
1807                                 limit = w->wi_cost_limit_base;
1808                         /* floor of 1 avoids division-by-zero in the vacuum code */
1809                         if (limit < 1)
1810                                 limit = 1;
1811                         w->wi_cost_limit = limit;
1812
1813                         elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1814                                  w->wi_proc->pid, w->wi_dboid, w->wi_tableoid,
1815                                  w->wi_cost_limit, w->wi_cost_limit_base,
1816                                  w->wi_cost_delay);
1817                 }
1818         }
1819 }
1820
1821 /*
1822  * get_database_list
1823  *              Build a List with one avw_dbase entry per pg_database row.
1824  *
1825  * The List and everything hanging off it are allocated in the memory context
1826  * that is current on entry; cleaning that up is the caller's business.
1827  *
1828  * Note: nowhere else does the autovacuum launcher run a transaction.  We are
1829  * not attached to any particular database, so most catalogs are out of
1830  * reach, but there is enough infrastructure in place to seqscan
1831  * pg_database.
1832  */
1833 static List *
1834 get_database_list(void)
1835 {
1836         MemoryContext callercxt;
1837         List       *result = NIL;
1838         Relation        pg_db;
1839         HeapScanDesc scandesc;
1840         HeapTuple       dbtup;
1841
1842         /* our output is allocated in whatever context we were called in */
1843         callercxt = CurrentMemoryContext;
1844
1845         /*
1846          * pg_database can only be read inside a transaction.  The snapshot is
1847          * taken purely for its side effect of setting RecentGlobalXmin, which
1848          * anything that reads heap pages depends on: HOT may prune a page even
1849          * when the reader never tries to modify a tuple.
1850          */
1851         StartTransactionCommand();
1852         (void) GetTransactionSnapshot();
1853
1854         pg_db = heap_open(DatabaseRelationId, AccessShareLock);
1855         scandesc = heap_beginscan_catalog(pg_db, 0, NULL);
1856
1857         for (dbtup = heap_getnext(scandesc, ForwardScanDirection);
1858                  HeapTupleIsValid(dbtup);
1859                  dbtup = heap_getnext(scandesc, ForwardScanDirection))
1860         {
1861                 Form_pg_database dbform = (Form_pg_database) GETSTRUCT(dbtup);
1862                 MemoryContext scancxt;
1863                 avw_dbase  *entry;
1864
1865                 /*
1866                  * Hop into the caller's context only for the duration of our own
1867                  * allocations and hop back before the next heap_getnext(), which
1868                  * is leaky and must not run in a potentially long-lived context.
1869                  */
1870                 scancxt = MemoryContextSwitchTo(callercxt);
1871
1872                 entry = (avw_dbase *) palloc(sizeof(avw_dbase));
1873
1874                 entry->adw_datid = HeapTupleGetOid(dbtup);
1875                 entry->adw_name = pstrdup(NameStr(dbform->datname));
1876                 entry->adw_frozenxid = dbform->datfrozenxid;
1877                 entry->adw_minmulti = dbform->datminmxid;
1878                 entry->adw_entry = NULL;        /* this gets set later */
1879
1880                 result = lappend(result, entry);
1881
1882                 MemoryContextSwitchTo(scancxt);
1883         }
1884
1885         heap_endscan(scandesc);
1886         heap_close(pg_db, AccessShareLock);
1887
1888         CommitTransactionCommand();
1889
1890         return result;
1891 }
1892
1893 /*
1894  * Process a database table-by-table
1895  *
1896  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1897  * order not to ignore shutdown commands for too long.
1898  */
1899 static void
1900 do_autovacuum(void)
1901 {
1902         Relation        classRel;
1903         HeapTuple       tuple;
1904         HeapScanDesc relScan;
1905         Form_pg_database dbForm;
1906         List       *table_oids = NIL;
1907         HASHCTL         ctl;
1908         HTAB       *table_toast_map;
1909         ListCell   *volatile cell;
1910         PgStat_StatDBEntry *shared;
1911         PgStat_StatDBEntry *dbentry;
1912         BufferAccessStrategy bstrategy;
1913         ScanKeyData key;
1914         TupleDesc       pg_class_desc;
1915
1916         /*
1917          * StartTransactionCommand and CommitTransactionCommand will automatically
1918          * switch to other contexts.  We need this one to keep the list of
1919          * relations to vacuum/analyze across transactions.
1920          */
1921         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1922                                                                                   "AV worker",
1923                                                                                   ALLOCSET_DEFAULT_MINSIZE,
1924                                                                                   ALLOCSET_DEFAULT_INITSIZE,
1925                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
1926         MemoryContextSwitchTo(AutovacMemCxt);
1927
1928         /*
1929          * may be NULL if we couldn't find an entry (only happens if we are
1930          * forcing a vacuum for anti-wrap purposes).
1931          */
1932         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1933
1934         /* Start a transaction so our commands have one to play into. */
1935         StartTransactionCommand();
1936
1937         /*
1938          * Clean up any dead statistics collector entries for this DB. We always
1939          * want to do this exactly once per DB-processing cycle, even if we find
1940          * nothing worth vacuuming in the database.
1941          */
1942         pgstat_vacuum_stat();
1943
1944         /*
1945          * Find the pg_database entry and select the default freeze ages. We use
1946          * zero in template and nonconnectable databases, else the system-wide
1947          * default.
1948          */
1949         tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
1950         if (!HeapTupleIsValid(tuple))
1951                 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1952         dbForm = (Form_pg_database) GETSTRUCT(tuple);
1953
1954         if (dbForm->datistemplate || !dbForm->datallowconn)
1955         {
1956                 default_freeze_min_age = 0;
1957                 default_freeze_table_age = 0;
1958         }
1959         else
1960         {
1961                 default_freeze_min_age = vacuum_freeze_min_age;
1962                 default_freeze_table_age = vacuum_freeze_table_age;
1963         }
1964
1965         ReleaseSysCache(tuple);
1966
1967         /* StartTransactionCommand changed elsewhere */
1968         MemoryContextSwitchTo(AutovacMemCxt);
1969
1970         /* The database hash where pgstat keeps shared relations */
1971         shared = pgstat_fetch_stat_dbentry(InvalidOid);
1972
1973         classRel = heap_open(RelationRelationId, AccessShareLock);
1974
1975         /* create a copy so we can use it after closing pg_class */
1976         pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
1977
1978         /* create hash table for toast <-> main relid mapping */
1979         MemSet(&ctl, 0, sizeof(ctl));
1980         ctl.keysize = sizeof(Oid);
1981         ctl.entrysize = sizeof(av_relation);
1982         ctl.hash = oid_hash;
1983
1984         table_toast_map = hash_create("TOAST to main relid map",
1985                                                                   100,
1986                                                                   &ctl,
1987                                                                   HASH_ELEM | HASH_FUNCTION);
1988
1989         /*
1990          * Scan pg_class to determine which tables to vacuum.
1991          *
1992          * We do this in two passes: on the first one we collect the list of plain
1993          * relations and materialized views, and on the second one we collect
1994          * TOAST tables. The reason for doing the second pass is that during it we
1995          * want to use the main relation's pg_class.reloptions entry if the TOAST
1996          * table does not have any, and we cannot obtain it unless we know
1997          * beforehand what's the main  table OID.
1998          *
1999          * We need to check TOAST tables separately because in cases with short,
2000          * wide tables there might be proportionally much more activity in the
2001          * TOAST table than in its parent.
2002          */
2003         relScan = heap_beginscan_catalog(classRel, 0, NULL);
2004
2005         /*
2006          * On the first pass, we collect main tables to vacuum, and also the main
2007          * table relid to TOAST relid mapping.
2008          */
2009         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2010         {
2011                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2012                 PgStat_StatTabEntry *tabentry;
2013                 AutoVacOpts *relopts;
2014                 Oid                     relid;
2015                 bool            dovacuum;
2016                 bool            doanalyze;
2017                 bool            wraparound;
2018
2019                 if (classForm->relkind != RELKIND_RELATION &&
2020                         classForm->relkind != RELKIND_MATVIEW)
2021                         continue;
2022
2023                 relid = HeapTupleGetOid(tuple);
2024
2025                 /* Fetch reloptions and the pgstat entry for this table */
2026                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2027                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2028                                                                                          shared, dbentry);
2029
2030                 /* Check if it needs vacuum or analyze */
2031                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2032                                                                   &dovacuum, &doanalyze, &wraparound);
2033
2034                 /*
2035                  * Check if it is a temp table (presumably, of some other backend's).
2036                  * We cannot safely process other backends' temp tables.
2037                  */
2038                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2039                 {
2040                         int                     backendID;
2041
2042                         backendID = GetTempNamespaceBackendId(classForm->relnamespace);
2043
2044                         /* We just ignore it if the owning backend is still active */
2045                         if (backendID == MyBackendId || BackendIdGetProc(backendID) == NULL)
2046                         {
2047                                 /*
2048                                  * We found an orphan temp table (which was probably left
2049                                  * behind by a crashed backend).  If it's so old as to need
2050                                  * vacuum for wraparound, forcibly drop it.  Otherwise just
2051                                  * log a complaint.
2052                                  */
2053                                 if (wraparound)
2054                                 {
2055                                         ObjectAddress object;
2056
2057                                         ereport(LOG,
2058                                                         (errmsg("autovacuum: dropping orphan temp table \"%s\".\"%s\" in database \"%s\"",
2059                                                                  get_namespace_name(classForm->relnamespace),
2060                                                                         NameStr(classForm->relname),
2061                                                                         get_database_name(MyDatabaseId))));
2062                                         object.classId = RelationRelationId;
2063                                         object.objectId = relid;
2064                                         object.objectSubId = 0;
2065                                         performDeletion(&object, DROP_CASCADE, PERFORM_DELETION_INTERNAL);
2066                                 }
2067                                 else
2068                                 {
2069                                         ereport(LOG,
2070                                                         (errmsg("autovacuum: found orphan temp table \"%s\".\"%s\" in database \"%s\"",
2071                                                                  get_namespace_name(classForm->relnamespace),
2072                                                                         NameStr(classForm->relname),
2073                                                                         get_database_name(MyDatabaseId))));
2074                                 }
2075                         }
2076                 }
2077                 else
2078                 {
2079                         /* relations that need work are added to table_oids */
2080                         if (dovacuum || doanalyze)
2081                                 table_oids = lappend_oid(table_oids, relid);
2082
2083                         /*
2084                          * Remember the association for the second pass.  Note: we must do
2085                          * this even if the table is going to be vacuumed, because we
2086                          * don't automatically vacuum toast tables along the parent table.
2087                          */
2088                         if (OidIsValid(classForm->reltoastrelid))
2089                         {
2090                                 av_relation *hentry;
2091                                 bool            found;
2092
2093                                 hentry = hash_search(table_toast_map,
2094                                                                          &classForm->reltoastrelid,
2095                                                                          HASH_ENTER, &found);
2096
2097                                 if (!found)
2098                                 {
2099                                         /* hash_search already filled in the key */
2100                                         hentry->ar_relid = relid;
2101                                         hentry->ar_hasrelopts = false;
2102                                         if (relopts != NULL)
2103                                         {
2104                                                 hentry->ar_hasrelopts = true;
2105                                                 memcpy(&hentry->ar_reloptions, relopts,
2106                                                            sizeof(AutoVacOpts));
2107                                         }
2108                                 }
2109                         }
2110                 }
2111         }
2112
2113         heap_endscan(relScan);
2114
2115         /* second pass: check TOAST tables */
2116         ScanKeyInit(&key,
2117                                 Anum_pg_class_relkind,
2118                                 BTEqualStrategyNumber, F_CHAREQ,
2119                                 CharGetDatum(RELKIND_TOASTVALUE));
2120
2121         relScan = heap_beginscan_catalog(classRel, 1, &key);
2122         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2123         {
2124                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2125                 PgStat_StatTabEntry *tabentry;
2126                 Oid                     relid;
2127                 AutoVacOpts *relopts = NULL;
2128                 bool            dovacuum;
2129                 bool            doanalyze;
2130                 bool            wraparound;
2131
2132                 /*
2133                  * We cannot safely process other backends' temp tables, so skip 'em.
2134                  */
2135                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2136                         continue;
2137
2138                 relid = HeapTupleGetOid(tuple);
2139
2140                 /*
2141                  * fetch reloptions -- if this toast table does not have them, try the
2142                  * main rel
2143                  */
2144                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2145                 if (relopts == NULL)
2146                 {
2147                         av_relation *hentry;
2148                         bool            found;
2149
2150                         hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2151                         if (found && hentry->ar_hasrelopts)
2152                                 relopts = &hentry->ar_reloptions;
2153                 }
2154
2155                 /* Fetch the pgstat entry for this table */
2156                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2157                                                                                          shared, dbentry);
2158
2159                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2160                                                                   &dovacuum, &doanalyze, &wraparound);
2161
2162                 /* ignore analyze for toast tables */
2163                 if (dovacuum)
2164                         table_oids = lappend_oid(table_oids, relid);
2165         }
2166
2167         heap_endscan(relScan);
2168         heap_close(classRel, AccessShareLock);
2169
2170         /*
2171          * Create a buffer access strategy object for VACUUM to use.  We want to
2172          * use the same one across all the vacuum operations we perform, since the
2173          * point is for VACUUM not to blow out the shared cache.
2174          */
2175         bstrategy = GetAccessStrategy(BAS_VACUUM);
2176
2177         /*
2178          * create a memory context to act as fake PortalContext, so that the
2179          * contexts created in the vacuum code are cleaned up for each table.
2180          */
2181         PortalContext = AllocSetContextCreate(AutovacMemCxt,
2182                                                                                   "Autovacuum Portal",
2183                                                                                   ALLOCSET_DEFAULT_INITSIZE,
2184                                                                                   ALLOCSET_DEFAULT_MINSIZE,
2185                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
2186
2187         /*
2188          * Perform operations on collected tables.
2189          */
2190         foreach(cell, table_oids)
2191         {
2192                 Oid                     relid = lfirst_oid(cell);
2193                 autovac_table *tab;
2194                 bool            skipit;
2195                 int                     stdVacuumCostDelay;
2196                 int                     stdVacuumCostLimit;
2197                 dlist_iter      iter;
2198
2199                 CHECK_FOR_INTERRUPTS();
2200
2201                 /*
2202                  * hold schedule lock from here until we're sure that this table still
2203                  * needs vacuuming.  We also need the AutovacuumLock to walk the
2204                  * worker array, but we'll let go of that one quickly.
2205                  */
2206                 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2207                 LWLockAcquire(AutovacuumLock, LW_SHARED);
2208
2209                 /*
2210                  * Check whether the table is being vacuumed concurrently by another
2211                  * worker.
2212                  */
2213                 skipit = false;
2214                 dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2215                 {
2216                         WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2217
2218                         /* ignore myself */
2219                         if (worker == MyWorkerInfo)
2220                                 continue;
2221
2222                         /* ignore workers in other databases */
2223                         if (worker->wi_dboid != MyDatabaseId)
2224                                 continue;
2225
2226                         if (worker->wi_tableoid == relid)
2227                         {
2228                                 skipit = true;
2229                                 break;
2230                         }
2231                 }
2232                 LWLockRelease(AutovacuumLock);
2233                 if (skipit)
2234                 {
2235                         LWLockRelease(AutovacuumScheduleLock);
2236                         continue;
2237                 }
2238
2239                 /*
2240                  * Check whether pgstat data still says we need to vacuum this table.
2241                  * It could have changed if something else processed the table while
2242                  * we weren't looking.
2243                  *
2244                  * Note: we have a special case in pgstat code to ensure that the
2245                  * stats we read are as up-to-date as possible, to avoid the problem
2246                  * that somebody just finished vacuuming this table.  The window to
2247                  * the race condition is not closed but it is very small.
2248                  */
2249                 MemoryContextSwitchTo(AutovacMemCxt);
2250                 tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc);
2251                 if (tab == NULL)
2252                 {
2253                         /* someone else vacuumed the table, or it went away */
2254                         LWLockRelease(AutovacuumScheduleLock);
2255                         continue;
2256                 }
2257
2258                 /*
2259                  * Ok, good to go.      Store the table in shared memory before releasing
2260                  * the lock so that other workers don't vacuum it concurrently.
2261                  */
2262                 MyWorkerInfo->wi_tableoid = relid;
2263                 LWLockRelease(AutovacuumScheduleLock);
2264
2265                 /*
2266                  * Remember the prevailing values of the vacuum cost GUCs.      We have to
2267                  * restore these at the bottom of the loop, else we'll compute wrong
2268                  * values in the next iteration of autovac_balance_cost().
2269                  */
2270                 stdVacuumCostDelay = VacuumCostDelay;
2271                 stdVacuumCostLimit = VacuumCostLimit;
2272
2273                 /* Must hold AutovacuumLock while mucking with cost balance info */
2274                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2275
2276                 /* advertise my cost delay parameters for the balancing algorithm */
2277                 MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2278                 MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2279                 MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2280
2281                 /* do a balance */
2282                 autovac_balance_cost();
2283
2284                 /* set the active cost parameters from the result of that */
2285                 AutoVacuumUpdateDelay();
2286
2287                 /* done */
2288                 LWLockRelease(AutovacuumLock);
2289
2290                 /* clean up memory before each iteration */
2291                 MemoryContextResetAndDeleteChildren(PortalContext);
2292
2293                 /*
2294                  * Save the relation name for a possible error message, to avoid a
2295                  * catalog lookup in case of an error.  If any of these return NULL,
2296                  * then the relation has been dropped since last we checked; skip it.
2297                  * Note: they must live in a long-lived memory context because we call
2298                  * vacuum and analyze in different transactions.
2299                  */
2300
2301                 tab->at_relname = get_rel_name(tab->at_relid);
2302                 tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2303                 tab->at_datname = get_database_name(MyDatabaseId);
2304                 if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2305                         goto deleted;
2306
2307                 /*
2308                  * We will abort vacuuming the current table if something errors out,
2309                  * and continue with the next one in schedule; in particular, this
2310                  * happens if we are interrupted with SIGINT.
2311                  */
2312                 PG_TRY();
2313                 {
2314                         /* have at it */
2315                         MemoryContextSwitchTo(TopTransactionContext);
2316                         autovacuum_do_vac_analyze(tab, bstrategy);
2317
2318                         /*
2319                          * Clear a possible query-cancel signal, to avoid a late reaction
2320                          * to an automatically-sent signal because of vacuuming the
2321                          * current table (we're done with it, so it would make no sense to
2322                          * cancel at this point.)
2323                          */
2324                         QueryCancelPending = false;
2325                 }
2326                 PG_CATCH();
2327                 {
2328                         /*
2329                          * Abort the transaction, start a new one, and proceed with the
2330                          * next table in our list.
2331                          */
2332                         HOLD_INTERRUPTS();
2333                         if (tab->at_dovacuum)
2334                                 errcontext("automatic vacuum of table \"%s.%s.%s\"",
2335                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2336                         else
2337                                 errcontext("automatic analyze of table \"%s.%s.%s\"",
2338                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2339                         EmitErrorReport();
2340
2341                         /* this resets the PGXACT flags too */
2342                         AbortOutOfAnyTransaction();
2343                         FlushErrorState();
2344                         MemoryContextResetAndDeleteChildren(PortalContext);
2345
2346                         /* restart our transaction for the following operations */
2347                         StartTransactionCommand();
2348                         RESUME_INTERRUPTS();
2349                 }
2350                 PG_END_TRY();
2351
2352                 /* the PGXACT flags are reset at the next end of transaction */
2353
2354                 /* be tidy */
2355 deleted:
2356                 if (tab->at_datname != NULL)
2357                         pfree(tab->at_datname);
2358                 if (tab->at_nspname != NULL)
2359                         pfree(tab->at_nspname);
2360                 if (tab->at_relname != NULL)
2361                         pfree(tab->at_relname);
2362                 pfree(tab);
2363
2364                 /*
2365                  * Remove my info from shared memory.  We could, but intentionally
2366                  * don't, clear wi_cost_limit and friends --- this is on the
2367                  * assumption that we probably have more to do with similar cost
2368                  * settings, so we don't want to give up our share of I/O for a very
2369                  * short interval and thereby thrash the global balance.
2370                  */
2371                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2372                 MyWorkerInfo->wi_tableoid = InvalidOid;
2373                 LWLockRelease(AutovacuumLock);
2374
2375                 /* restore vacuum cost GUCs for the next iteration */
2376                 VacuumCostDelay = stdVacuumCostDelay;
2377                 VacuumCostLimit = stdVacuumCostLimit;
2378         }
2379
2380         /*
2381          * We leak table_toast_map here (among other things), but since we're
2382          * going away soon, it's not a problem.
2383          */
2384
2385         /*
2386          * Update pg_database.datfrozenxid, and truncate pg_clog if possible. We
2387          * only need to do this once, not after each table.
2388          */
2389         vac_update_datfrozenxid();
2390
2391         /* Finally close out the last transaction. */
2392         CommitTransactionCommand();
2393 }
2394
2395 /*
2396  * extract_autovac_opts
2397  *
2398  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2399  * reloptions, if set; otherwise, return NULL.
2400  */
2401 static AutoVacOpts *
2402 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2403 {
2404         bytea      *relopts;
2405         AutoVacOpts *av;
2406
2407         Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2408                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2409                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2410
2411         relopts = extractRelOptions(tup, pg_class_desc, InvalidOid);
2412         if (relopts == NULL)
2413                 return NULL;
2414
2415         av = palloc(sizeof(AutoVacOpts));
2416         memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2417         pfree(relopts);
2418
2419         return av;
2420 }
2421
2422 /*
2423  * get_pgstat_tabentry_relid
2424  *
2425  * Fetch the pgstat entry of a table, either local to a database or shared.
2426  */
2427 static PgStat_StatTabEntry *
2428 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2429                                                   PgStat_StatDBEntry *dbentry)
2430 {
2431         PgStat_StatTabEntry *tabentry = NULL;
2432
2433         if (isshared)
2434         {
2435                 if (PointerIsValid(shared))
2436                         tabentry = hash_search(shared->tables, &relid,
2437                                                                    HASH_FIND, NULL);
2438         }
2439         else if (PointerIsValid(dbentry))
2440                 tabentry = hash_search(dbentry->tables, &relid,
2441                                                            HASH_FIND, NULL);
2442
2443         return tabentry;
2444 }
2445
2446 /*
2447  * table_recheck_autovac
2448  *
2449  * Recheck whether a table still needs vacuum or analyze.  Return value is a
2450  * valid autovac_table pointer if it does, NULL otherwise.
2451  *
2452  * Note that the returned autovac_table does not have the name fields set.
2453  */
2454 static autovac_table *
2455 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2456                                           TupleDesc pg_class_desc)
2457 {
2458         Form_pg_class classForm;
2459         HeapTuple       classTup;
2460         bool            dovacuum;
2461         bool            doanalyze;
2462         autovac_table *tab = NULL;
2463         PgStat_StatTabEntry *tabentry;
2464         PgStat_StatDBEntry *shared;
2465         PgStat_StatDBEntry *dbentry;
2466         bool            wraparound;
2467         AutoVacOpts *avopts;
2468
2469         /* use fresh stats */
2470         autovac_refresh_stats();
2471
2472         shared = pgstat_fetch_stat_dbentry(InvalidOid);
2473         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2474
2475         /* fetch the relation's relcache entry */
2476         classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2477         if (!HeapTupleIsValid(classTup))
2478                 return NULL;
2479         classForm = (Form_pg_class) GETSTRUCT(classTup);
2480
2481         /*
2482          * Get the applicable reloptions.  If it is a TOAST table, try to get the
2483          * main table reloptions if the toast table itself doesn't have.
2484          */
2485         avopts = extract_autovac_opts(classTup, pg_class_desc);
2486         if (classForm->relkind == RELKIND_TOASTVALUE &&
2487                 avopts == NULL && table_toast_map != NULL)
2488         {
2489                 av_relation *hentry;
2490                 bool            found;
2491
2492                 hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2493                 if (found && hentry->ar_hasrelopts)
2494                         avopts = &hentry->ar_reloptions;
2495         }
2496
2497         /* fetch the pgstat table entry */
2498         tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2499                                                                                  shared, dbentry);
2500
2501         relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2502                                                           &dovacuum, &doanalyze, &wraparound);
2503
2504         /* ignore ANALYZE for toast tables */
2505         if (classForm->relkind == RELKIND_TOASTVALUE)
2506                 doanalyze = false;
2507
2508         /* OK, it needs something done */
2509         if (doanalyze || dovacuum)
2510         {
2511                 int                     freeze_min_age;
2512                 int                     freeze_table_age;
2513                 int                     vac_cost_limit;
2514                 int                     vac_cost_delay;
2515
2516                 /*
2517                  * Calculate the vacuum cost parameters and the freeze ages.  If there
2518                  * are options set in pg_class.reloptions, use them; in the case of a
2519                  * toast table, try the main table too.  Otherwise use the GUC
2520                  * defaults, autovacuum's own first and plain vacuum second.
2521                  */
2522
2523                 /* -1 in autovac setting means use plain vacuum_cost_delay */
2524                 vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2525                         ? avopts->vacuum_cost_delay
2526                         : (autovacuum_vac_cost_delay >= 0)
2527                         ? autovacuum_vac_cost_delay
2528                         : VacuumCostDelay;
2529
2530                 /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2531                 vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2532                         ? avopts->vacuum_cost_limit
2533                         : (autovacuum_vac_cost_limit > 0)
2534                         ? autovacuum_vac_cost_limit
2535                         : VacuumCostLimit;
2536
2537                 /* these do not have autovacuum-specific settings */
2538                 freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2539                         ? avopts->freeze_min_age
2540                         : default_freeze_min_age;
2541
2542                 freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2543                         ? avopts->freeze_table_age
2544                         : default_freeze_table_age;
2545
2546                 tab = palloc(sizeof(autovac_table));
2547                 tab->at_relid = relid;
2548                 tab->at_dovacuum = dovacuum;
2549                 tab->at_doanalyze = doanalyze;
2550                 tab->at_freeze_min_age = freeze_min_age;
2551                 tab->at_freeze_table_age = freeze_table_age;
2552                 tab->at_vacuum_cost_limit = vac_cost_limit;
2553                 tab->at_vacuum_cost_delay = vac_cost_delay;
2554                 tab->at_wraparound = wraparound;
2555                 tab->at_relname = NULL;
2556                 tab->at_nspname = NULL;
2557                 tab->at_datname = NULL;
2558         }
2559
2560         heap_freetuple(classTup);
2561
2562         return tab;
2563 }
2564
2565 /*
2566  * relation_needs_vacanalyze
2567  *
2568  * Check whether a relation needs to be vacuumed or analyzed; return each into
2569  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2570  * being forced because of Xid wraparound.
2571  *
2572  * relopts is a pointer to the AutoVacOpts options (either for itself in the
2573  * case of a plain table, or for either itself or its parent table in the case
2574  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2575  * NULL.
2576  *
2577  * A table needs to be vacuumed if the number of dead tuples exceeds a
2578  * threshold.  This threshold is calculated as
2579  *
2580  * threshold = vac_base_thresh + vac_scale_factor * reltuples
2581  *
2582  * For analyze, the analysis done is that the number of tuples inserted,
2583  * deleted and updated since the last analyze exceeds a threshold calculated
2584  * in the same fashion as above.  Note that the collector actually stores
2585  * the number of tuples (both live and dead) that there were as of the last
2586  * analyze.  This is asymmetric to the VACUUM case.
2587  *
2588  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2589  * transactions back.
2590  *
2591  * A table whose autovacuum_enabled option is false is
2592  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2593  * Thus autovacuum can be disabled for specific tables. Also, when the stats
2594  * collector does not have data about a table, it will be skipped.
2595  *
2596  * A table whose vac_base_thresh value is < 0 takes the base value from the
2597  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2598  * value < 0 is substituted with the value of
2599  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2600  */
2601 static void
2602 relation_needs_vacanalyze(Oid relid,
2603                                                   AutoVacOpts *relopts,
2604                                                   Form_pg_class classForm,
2605                                                   PgStat_StatTabEntry *tabentry,
2606  /* output params below */
2607                                                   bool *dovacuum,
2608                                                   bool *doanalyze,
2609                                                   bool *wraparound)
2610 {
2611         bool            force_vacuum;
2612         bool            av_enabled;
2613         float4          reltuples;              /* pg_class.reltuples */
2614
2615         /* constants from reloptions or GUC variables */
2616         int                     vac_base_thresh,
2617                                 anl_base_thresh;
2618         float4          vac_scale_factor,
2619                                 anl_scale_factor;
2620
2621         /* thresholds calculated from above constants */
2622         float4          vacthresh,
2623                                 anlthresh;
2624
2625         /* number of vacuum (resp. analyze) tuples at this time */
2626         float4          vactuples,
2627                                 anltuples;
2628
2629         /* freeze parameters */
2630         int                     freeze_max_age;
2631         TransactionId xidForceLimit;
2632         MultiXactId multiForceLimit;
2633
2634         AssertArg(classForm != NULL);
2635         AssertArg(OidIsValid(relid));
2636
2637         /*
2638          * Determine vacuum/analyze equation parameters.  We have two possible
2639          * sources: the passed reloptions (which could be a main table or a toast
2640          * table), or the autovacuum GUC variables.
2641          */
2642
2643         /* -1 in autovac setting means use plain vacuum_cost_delay */
2644         vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
2645                 ? relopts->vacuum_scale_factor
2646                 : autovacuum_vac_scale;
2647
2648         vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
2649                 ? relopts->vacuum_threshold
2650                 : autovacuum_vac_thresh;
2651
2652         anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
2653                 ? relopts->analyze_scale_factor
2654                 : autovacuum_anl_scale;
2655
2656         anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
2657                 ? relopts->analyze_threshold
2658                 : autovacuum_anl_thresh;
2659
2660         freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
2661                 ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
2662                 : autovacuum_freeze_max_age;
2663
2664         av_enabled = (relopts ? relopts->enabled : true);
2665
2666         /* Force vacuum if table is at risk of wraparound */
2667         xidForceLimit = recentXid - freeze_max_age;
2668         if (xidForceLimit < FirstNormalTransactionId)
2669                 xidForceLimit -= FirstNormalTransactionId;
2670         force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
2671                                         TransactionIdPrecedes(classForm->relfrozenxid,
2672                                                                                   xidForceLimit));
2673         if (!force_vacuum)
2674         {
2675                 multiForceLimit = recentMulti - autovacuum_freeze_max_age;
2676                 if (multiForceLimit < FirstMultiXactId)
2677                         multiForceLimit -= FirstMultiXactId;
2678                 force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
2679                                                                                    multiForceLimit);
2680         }
2681         *wraparound = force_vacuum;
2682
2683         /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
2684         if (!force_vacuum && !av_enabled)
2685         {
2686                 *doanalyze = false;
2687                 *dovacuum = false;
2688                 return;
2689         }
2690
2691         if (PointerIsValid(tabentry))
2692         {
2693                 reltuples = classForm->reltuples;
2694                 vactuples = tabentry->n_dead_tuples;
2695                 anltuples = tabentry->changes_since_analyze;
2696
2697                 vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
2698                 anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
2699
2700                 /*
2701                  * Note that we don't need to take special consideration for stat
2702                  * reset, because if that happens, the last vacuum and analyze counts
2703                  * will be reset too.
2704                  */
2705                 elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
2706                          NameStr(classForm->relname),
2707                          vactuples, vacthresh, anltuples, anlthresh);
2708
2709                 /* Determine if this table needs vacuum or analyze. */
2710                 *dovacuum = force_vacuum || (vactuples > vacthresh);
2711                 *doanalyze = (anltuples > anlthresh);
2712         }
2713         else
2714         {
2715                 /*
2716                  * Skip a table not found in stat hash, unless we have to force vacuum
2717                  * for anti-wrap purposes.      If it's not acted upon, there's no need to
2718                  * vacuum it.
2719                  */
2720                 *dovacuum = force_vacuum;
2721                 *doanalyze = false;
2722         }
2723
2724         /* ANALYZE refuses to work with pg_statistics */
2725         if (relid == StatisticRelationId)
2726                 *doanalyze = false;
2727 }
2728
2729 /*
2730  * autovacuum_do_vac_analyze
2731  *              Vacuum and/or analyze the specified table
2732  */
2733 static void
2734 autovacuum_do_vac_analyze(autovac_table *tab,
2735                                                   BufferAccessStrategy bstrategy)
2736 {
2737         VacuumStmt      vacstmt;
2738         RangeVar        rangevar;
2739
2740         /* Set up command parameters --- use local variables instead of palloc */
2741         MemSet(&vacstmt, 0, sizeof(vacstmt));
2742         MemSet(&rangevar, 0, sizeof(rangevar));
2743
2744         rangevar.schemaname = tab->at_nspname;
2745         rangevar.relname = tab->at_relname;
2746         rangevar.location = -1;
2747
2748         vacstmt.type = T_VacuumStmt;
2749         if (!tab->at_wraparound)
2750                 vacstmt.options = VACOPT_NOWAIT;
2751         if (tab->at_dovacuum)
2752                 vacstmt.options |= VACOPT_VACUUM;
2753         if (tab->at_doanalyze)
2754                 vacstmt.options |= VACOPT_ANALYZE;
2755         vacstmt.freeze_min_age = tab->at_freeze_min_age;
2756         vacstmt.freeze_table_age = tab->at_freeze_table_age;
2757         /* we pass the OID, but might need this anyway for an error message */
2758         vacstmt.relation = &rangevar;
2759         vacstmt.va_cols = NIL;
2760
2761         /* Let pgstat know what we're doing */
2762         autovac_report_activity(tab);
2763
2764         vacuum(&vacstmt, tab->at_relid, false, bstrategy, tab->at_wraparound, true);
2765 }
2766
2767 /*
2768  * autovac_report_activity
2769  *              Report to pgstat what autovacuum is doing
2770  *
2771  * We send a SQL string corresponding to what the user would see if the
2772  * equivalent command was to be issued manually.
2773  *
2774  * Note we assume that we are going to report the next command as soon as we're
2775  * done with the current one, and exit right after the last one, so we don't
2776  * bother to report "<IDLE>" or some such.
2777  */
2778 static void
2779 autovac_report_activity(autovac_table *tab)
2780 {
2781 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
2782         char            activity[MAX_AUTOVAC_ACTIV_LEN];
2783         int                     len;
2784
2785         /* Report the command and possible options */
2786         if (tab->at_dovacuum)
2787                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2788                                  "autovacuum: VACUUM%s",
2789                                  tab->at_doanalyze ? " ANALYZE" : "");
2790         else
2791                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2792                                  "autovacuum: ANALYZE");
2793
2794         /*
2795          * Report the qualified name of the relation.
2796          */
2797         len = strlen(activity);
2798
2799         snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
2800                          " %s.%s%s", tab->at_nspname, tab->at_relname,
2801                          tab->at_wraparound ? " (to prevent wraparound)" : "");
2802
2803         /* Set statement_timestamp() to current time for pg_stat_activity */
2804         SetCurrentStatementStartTimestamp();
2805
2806         pgstat_report_activity(STATE_RUNNING, activity);
2807 }
2808
2809 /*
2810  * AutoVacuumingActive
2811  *              Check GUC vars and report whether the autovacuum process should be
2812  *              running.
2813  */
2814 bool
2815 AutoVacuumingActive(void)
2816 {
2817         if (!autovacuum_start_daemon || !pgstat_track_counts)
2818                 return false;
2819         return true;
2820 }
2821
2822 /*
2823  * autovac_init
2824  *              This is called at postmaster initialization.
2825  *
2826  * All we do here is annoy the user if he got it wrong.
2827  */
2828 void
2829 autovac_init(void)
2830 {
2831         if (autovacuum_start_daemon && !pgstat_track_counts)
2832                 ereport(WARNING,
2833                                 (errmsg("autovacuum not started because of misconfiguration"),
2834                                  errhint("Enable the \"track_counts\" option.")));
2835 }
2836
/*
 * IsAutoVacuumLauncherProcess
 *		Return whether the current process is the autovacuum launcher.
 *
 * This simply reports the value of the am_autovacuum_launcher flag.
 */
bool
IsAutoVacuumLauncherProcess(void)
{
	return am_autovacuum_launcher;
}
2847
/*
 * IsAutoVacuumWorkerProcess
 *		Return whether the current process is an autovacuum worker.
 *
 * This simply reports the value of the am_autovacuum_worker flag.
 */
bool
IsAutoVacuumWorkerProcess(void)
{
	return am_autovacuum_worker;
}
2853
2854
2855 /*
2856  * AutoVacuumShmemSize
2857  *              Compute space needed for autovacuum-related shared memory
2858  */
2859 Size
2860 AutoVacuumShmemSize(void)
2861 {
2862         Size            size;
2863
2864         /*
2865          * Need the fixed struct and the array of WorkerInfoData.
2866          */
2867         size = sizeof(AutoVacuumShmemStruct);
2868         size = MAXALIGN(size);
2869         size = add_size(size, mul_size(autovacuum_max_workers,
2870                                                                    sizeof(WorkerInfoData)));
2871         return size;
2872 }
2873
2874 /*
2875  * AutoVacuumShmemInit
2876  *              Allocate and initialize autovacuum-related shared memory
2877  */
2878 void
2879 AutoVacuumShmemInit(void)
2880 {
2881         bool            found;
2882
2883         AutoVacuumShmem = (AutoVacuumShmemStruct *)
2884                 ShmemInitStruct("AutoVacuum Data",
2885                                                 AutoVacuumShmemSize(),
2886                                                 &found);
2887
2888         if (!IsUnderPostmaster)
2889         {
2890                 WorkerInfo      worker;
2891                 int                     i;
2892
2893                 Assert(!found);
2894
2895                 AutoVacuumShmem->av_launcherpid = 0;
2896                 dlist_init(&AutoVacuumShmem->av_freeWorkers);
2897                 dlist_init(&AutoVacuumShmem->av_runningWorkers);
2898                 AutoVacuumShmem->av_startingWorker = NULL;
2899
2900                 worker = (WorkerInfo) ((char *) AutoVacuumShmem +
2901                                                            MAXALIGN(sizeof(AutoVacuumShmemStruct)));
2902
2903                 /* initialize the WorkerInfo free list */
2904                 for (i = 0; i < autovacuum_max_workers; i++)
2905                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
2906                                                         &worker[i].wi_links);
2907         }
2908         else
2909                 Assert(found);
2910 }
2911
2912 /*
2913  * autovac_refresh_stats
2914  *              Refresh pgstats data for an autovacuum process
2915  *
2916  * Cause the next pgstats read operation to obtain fresh data, but throttle
2917  * such refreshing in the autovacuum launcher.  This is mostly to avoid
2918  * rereading the pgstats files too many times in quick succession when there
2919  * are many databases.
2920  *
2921  * Note: we avoid throttling in the autovac worker, as it would be
2922  * counterproductive in the recheck logic.
2923  */
2924 static void
2925 autovac_refresh_stats(void)
2926 {
2927         if (IsAutoVacuumLauncherProcess())
2928         {
2929                 static TimestampTz last_read = 0;
2930                 TimestampTz current_time;
2931
2932                 current_time = GetCurrentTimestamp();
2933
2934                 if (!TimestampDifferenceExceeds(last_read, current_time,
2935                                                                                 STATS_READ_DELAY))
2936                         return;
2937
2938                 last_read = current_time;
2939         }
2940
2941         pgstat_clear_snapshot();
2942 }