]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/autovacuum.c
Separate multixact freezing parameters from xid's
[postgresql] / src / backend / postmaster / autovacuum.c
1 /*-------------------------------------------------------------------------
2  *
3  * autovacuum.c
4  *
5  * PostgreSQL Integrated Autovacuum Daemon
6  *
7  * The autovacuum system is structured in two different kinds of processes: the
8  * autovacuum launcher and the autovacuum worker.  The launcher is an
9  * always-running process, started by the postmaster when the autovacuum GUC
10  * parameter is set.  The launcher schedules autovacuum workers to be started
11  * when appropriate.  The workers are the processes which execute the actual
12  * vacuuming; they connect to a database as determined in the launcher, and
13  * once connected they examine the catalogs to select the tables to vacuum.
14  *
15  * The autovacuum launcher cannot start the worker processes by itself,
16  * because doing so would cause robustness issues (namely, failure to shut
17  * them down on exceptional conditions, and also, since the launcher is
18  * connected to shared memory and is thus subject to corruption there, it is
19  * not as robust as the postmaster).  So it leaves that task to the postmaster.
20  *
21  * There is an autovacuum shared memory area, where the launcher stores
22  * information about the database it wants vacuumed.  When it wants a new
23  * worker to start, it sets a flag in shared memory and sends a signal to the
24  * postmaster.  Then postmaster knows nothing more than it must start a worker;
25  * so it forks a new child, which turns into a worker.  This new process
26  * connects to shared memory, and there it can inspect the information that the
27  * launcher has set up.
28  *
29  * If the fork() call fails in the postmaster, it sets a flag in the shared
30  * memory area, and sends a signal to the launcher.  The launcher, upon
31  * noticing the flag, can try starting the worker again by resending the
32  * signal.  Note that the failure can only be transient (fork failure due to
33  * high load, memory pressure, too many processes, etc); more permanent
34  * problems, like failure to connect to a database, are detected later in the
35  * worker and dealt with just by having the worker exit normally.  The launcher
36  * will launch a new worker again later, per schedule.
37  *
38  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
39  * launcher then wakes up and is able to launch another worker, if the schedule
40  * is so tight that a new worker is needed immediately.  At this time the
41  * launcher can also balance the settings for the various remaining workers'
42  * cost-based vacuum delay feature.
43  *
44  * Note that there can be more than one worker in a database concurrently.
45  * They will store the table they are currently vacuuming in shared memory, so
46  * that other workers avoid being blocked waiting for the vacuum lock for that
47  * table.  They will also reload the pgstats data just before vacuuming each
48  * table, to avoid vacuuming a table that was just finished being vacuumed by
49  * another worker and thus is no longer noted in shared memory.  However,
50  * there is a window (caused by pgstat delay) on which a worker may choose a
51  * table that was already vacuumed; this is a bug in the current design.
52  *
53  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
54  * Portions Copyright (c) 1994, Regents of the University of California
55  *
56  *
57  * IDENTIFICATION
58  *        src/backend/postmaster/autovacuum.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 #include "postgres.h"
63
64 #include <signal.h>
65 #include <sys/types.h>
66 #include <sys/time.h>
67 #include <time.h>
68 #include <unistd.h>
69
70 #include "access/heapam.h"
71 #include "access/htup_details.h"
72 #include "access/multixact.h"
73 #include "access/reloptions.h"
74 #include "access/transam.h"
75 #include "access/xact.h"
76 #include "catalog/dependency.h"
77 #include "catalog/namespace.h"
78 #include "catalog/pg_database.h"
79 #include "commands/dbcommands.h"
80 #include "commands/vacuum.h"
81 #include "lib/ilist.h"
82 #include "libpq/pqsignal.h"
83 #include "miscadmin.h"
84 #include "pgstat.h"
85 #include "postmaster/autovacuum.h"
86 #include "postmaster/fork_process.h"
87 #include "postmaster/postmaster.h"
88 #include "storage/bufmgr.h"
89 #include "storage/ipc.h"
90 #include "storage/latch.h"
91 #include "storage/pmsignal.h"
92 #include "storage/proc.h"
93 #include "storage/procsignal.h"
94 #include "storage/sinvaladt.h"
95 #include "tcop/tcopprot.h"
96 #include "utils/fmgroids.h"
97 #include "utils/lsyscache.h"
98 #include "utils/memutils.h"
99 #include "utils/ps_status.h"
100 #include "utils/rel.h"
101 #include "utils/snapmgr.h"
102 #include "utils/syscache.h"
103 #include "utils/timeout.h"
104 #include "utils/timestamp.h"
105 #include "utils/tqual.h"
106
107
/*
 * GUC parameters
 *
 * Non-static configuration variables of this module.  Only
 * autovacuum_start_daemon, autovacuum_work_mem and
 * Log_autovacuum_min_duration are given an explicit initial value here; the
 * rest start out zeroed like any other global.
 */
bool        autovacuum_start_daemon = false;
int         autovacuum_max_workers; /* the shared WorkerInfo array is sized
                                     * according to this */
int         autovacuum_work_mem = -1;
int         autovacuum_naptime;     /* used as a number of seconds by the
                                     * launcher */
int         autovacuum_vac_thresh;
double      autovacuum_vac_scale;
int         autovacuum_anl_thresh;
double      autovacuum_anl_scale;
int         autovacuum_freeze_max_age;
int         autovacuum_multixact_freeze_max_age;

/* cost-based vacuum delay settings */
int         autovacuum_vac_cost_delay;
int         autovacuum_vac_cost_limit;

int         Log_autovacuum_min_duration = -1;
126
/* Lifetime of the pgstat data kept in the launcher, in milliseconds. */
#define STATS_READ_DELAY 1000

/* Lower bound, in milliseconds, on the time between two launcher wakeups. */
#define MIN_AUTOVAC_SLEEPTIME 100.0
132
133 /* Flags to tell if we are in an autovacuum process */
134 static bool am_autovacuum_launcher = false;
135 static bool am_autovacuum_worker = false;
136
137 /* Flags set by signal handlers */
138 static volatile sig_atomic_t got_SIGHUP = false;
139 static volatile sig_atomic_t got_SIGUSR2 = false;
140 static volatile sig_atomic_t got_SIGTERM = false;
141
142 /* Comparison points for determining whether freeze_max_age is exceeded */
143 static TransactionId recentXid;
144 static MultiXactId recentMulti;
145
146 /* Default freeze ages to use for autovacuum (varies by database) */
147 static int      default_freeze_min_age;
148 static int      default_freeze_table_age;
149 static int      default_multixact_freeze_min_age;
150 static int      default_multixact_freeze_table_age;
151
152 /* Memory context for long-lived data */
153 static MemoryContext AutovacMemCxt;
154
155 /* struct to keep track of databases in launcher */
156 typedef struct avl_dbase
157 {
158         Oid                     adl_datid;              /* hash key -- must be first */
159         TimestampTz adl_next_worker;
160         int                     adl_score;
161         dlist_node      adl_node;
162 } avl_dbase;
163
164 /* struct to keep track of databases in worker */
165 typedef struct avw_dbase
166 {
167         Oid                     adw_datid;
168         char       *adw_name;
169         TransactionId adw_frozenxid;
170         MultiXactId adw_minmulti;
171         PgStat_StatDBEntry *adw_entry;
172 } avw_dbase;
173
174 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
175 typedef struct av_relation
176 {
177         Oid                     ar_toastrelid;  /* hash key - must be first */
178         Oid                     ar_relid;
179         bool            ar_hasrelopts;
180         AutoVacOpts ar_reloptions;      /* copy of AutoVacOpts from the main table's
181                                                                  * reloptions, or NULL if none */
182 } av_relation;
183
184 /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
185 typedef struct autovac_table
186 {
187         Oid                     at_relid;
188         bool            at_dovacuum;
189         bool            at_doanalyze;
190         int                     at_freeze_min_age;
191         int                     at_freeze_table_age;
192         int                     at_multixact_freeze_min_age;
193         int                     at_multixact_freeze_table_age;
194         int                     at_vacuum_cost_delay;
195         int                     at_vacuum_cost_limit;
196         bool            at_wraparound;
197         char       *at_relname;
198         char       *at_nspname;
199         char       *at_datname;
200 } autovac_table;
201
202 /*-------------
203  * This struct holds information about a single worker's whereabouts.  We keep
204  * an array of these in shared memory, sized according to
205  * autovacuum_max_workers.
206  *
207  * wi_links             entry into free list or running list
208  * wi_dboid             OID of the database this worker is supposed to work on
209  * wi_tableoid  OID of the table currently being vacuumed, if any
210  * wi_proc              pointer to PGPROC of the running worker, NULL if not started
211  * wi_launchtime Time at which this worker was launched
212  * wi_cost_*    Vacuum cost-based delay parameters current in this worker
213  *
214  * All fields are protected by AutovacuumLock, except for wi_tableoid which is
215  * protected by AutovacuumScheduleLock (which is read-only for everyone except
216  * that worker itself).
217  *-------------
218  */
219 typedef struct WorkerInfoData
220 {
221         dlist_node      wi_links;
222         Oid                     wi_dboid;
223         Oid                     wi_tableoid;
224         PGPROC     *wi_proc;
225         TimestampTz wi_launchtime;
226         int                     wi_cost_delay;
227         int                     wi_cost_limit;
228         int                     wi_cost_limit_base;
229 } WorkerInfoData;
230
231 typedef struct WorkerInfoData *WorkerInfo;
232
/*
 * Signals that remote processes may send to the launcher.  They are stored
 * atomically in shared memory, which lets other processes set them without
 * taking a lock.
 */
typedef enum
{
    AutoVacForkFailed = 0,      /* an attempt to start a worker failed */
    AutoVacRebalance,           /* the cost limits need rebalancing */
    AutoVacNumSignals           /* count of the above; keep this last */
} AutoVacuumSignal;
244
245 /*-------------
246  * The main autovacuum shmem struct.  On shared memory we store this main
247  * struct and the array of WorkerInfo structs.  This struct keeps:
248  *
249  * av_signal            set by other processes to indicate various conditions
250  * av_launcherpid       the PID of the autovacuum launcher
251  * av_freeWorkers       the WorkerInfo freelist
252  * av_runningWorkers the WorkerInfo non-free queue
253  * av_startingWorker pointer to WorkerInfo currently being started (cleared by
254  *                                      the worker itself as soon as it's up and running)
255  *
256  * This struct is protected by AutovacuumLock, except for av_signal and parts
257  * of the worker list (see above).
258  *-------------
259  */
260 typedef struct
261 {
262         sig_atomic_t av_signal[AutoVacNumSignals];
263         pid_t           av_launcherpid;
264         dlist_head      av_freeWorkers;
265         dlist_head      av_runningWorkers;
266         WorkerInfo      av_startingWorker;
267 } AutoVacuumShmemStruct;
268
269 static AutoVacuumShmemStruct *AutoVacuumShmem;
270
271 /*
272  * the database list (of avl_dbase elements) in the launcher, and the context
273  * that contains it
274  */
275 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
276 static MemoryContext DatabaseListCxt = NULL;
277
278 /* Pointer to my own WorkerInfo, valid on each worker */
279 static WorkerInfo MyWorkerInfo = NULL;
280
281 /* PID of launcher, valid only in worker while shutting down */
282 int                     AutovacuumLauncherPid = 0;
283
284 #ifdef EXEC_BACKEND
285 static pid_t avlauncher_forkexec(void);
286 static pid_t avworker_forkexec(void);
287 #endif
288 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) __attribute__((noreturn));
289 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) __attribute__((noreturn));
290
291 static Oid      do_start_worker(void);
292 static void launcher_determine_sleep(bool canlaunch, bool recursing,
293                                                  struct timeval * nap);
294 static void launch_worker(TimestampTz now);
295 static List *get_database_list(void);
296 static void rebuild_database_list(Oid newdb);
297 static int      db_comparator(const void *a, const void *b);
298 static void autovac_balance_cost(void);
299
300 static void do_autovacuum(void);
301 static void FreeWorkerInfo(int code, Datum arg);
302
303 static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
304                                           TupleDesc pg_class_desc);
305 static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
306                                                   Form_pg_class classForm,
307                                                   PgStat_StatTabEntry *tabentry,
308                                                   bool *dovacuum, bool *doanalyze, bool *wraparound);
309
310 static void autovacuum_do_vac_analyze(autovac_table *tab,
311                                                   BufferAccessStrategy bstrategy);
312 static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
313                                          TupleDesc pg_class_desc);
314 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
315                                                   PgStat_StatDBEntry *shared,
316                                                   PgStat_StatDBEntry *dbentry);
317 static void autovac_report_activity(autovac_table *tab);
318 static void avl_sighup_handler(SIGNAL_ARGS);
319 static void avl_sigusr2_handler(SIGNAL_ARGS);
320 static void avl_sigterm_handler(SIGNAL_ARGS);
321 static void autovac_refresh_stats(void);
322
323
324
325 /********************************************************************
326  *                                        AUTOVACUUM LAUNCHER CODE
327  ********************************************************************/
328
329 #ifdef EXEC_BACKEND
/*
 * avlauncher_forkexec
 *      Build the argument vector for the autovacuum launcher and hand it to
 *      postmaster_forkexec().
 *
 * Returns whatever postmaster_forkexec() returns.
 */
static pid_t
avlauncher_forkexec(void)
{
    char       *launcher_argv[10];
    int         nargs;

    launcher_argv[0] = "postgres";
    launcher_argv[1] = "--forkavlauncher";
    launcher_argv[2] = NULL;    /* slot filled in by postmaster_forkexec */
    nargs = 3;

    /* terminate the vector; the terminator is not counted in nargs */
    launcher_argv[nargs] = NULL;

    Assert(nargs < lengthof(launcher_argv));

    return postmaster_forkexec(nargs, launcher_argv);
}
350
351 /*
352  * We need this set from the outside, before InitProcess is called
353  */
354 void
355 AutovacuumLauncherIAm(void)
356 {
357         am_autovacuum_launcher = true;
358 }
359 #endif
360
361 /*
362  * Main entry point for autovacuum launcher process, to be called from the
363  * postmaster.
364  */
365 int
366 StartAutoVacLauncher(void)
367 {
368         pid_t           AutoVacPID;
369
370 #ifdef EXEC_BACKEND
371         switch ((AutoVacPID = avlauncher_forkexec()))
372 #else
373         switch ((AutoVacPID = fork_process()))
374 #endif
375         {
376                 case -1:
377                         ereport(LOG,
378                                  (errmsg("could not fork autovacuum launcher process: %m")));
379                         return 0;
380
381 #ifndef EXEC_BACKEND
382                 case 0:
383                         /* in postmaster child ... */
384                         /* Close the postmaster's sockets */
385                         ClosePostmasterPorts(false);
386
387                         /* Lose the postmaster's on-exit routines */
388                         on_exit_reset();
389
390                         AutoVacLauncherMain(0, NULL);
391                         break;
392 #endif
393                 default:
394                         return (int) AutoVacPID;
395         }
396
397         /* shouldn't get here */
398         return 0;
399 }
400
401 /*
402  * Main loop for the autovacuum launcher process.
403  */
404 NON_EXEC_STATIC void
405 AutoVacLauncherMain(int argc, char *argv[])
406 {
407         sigjmp_buf      local_sigjmp_buf;
408
409         /* we are a postmaster subprocess now */
410         IsUnderPostmaster = true;
411         am_autovacuum_launcher = true;
412
413         /* reset MyProcPid */
414         MyProcPid = getpid();
415
416         /* record Start Time for logging */
417         MyStartTime = time(NULL);
418
419         /* Identify myself via ps */
420         init_ps_display("autovacuum launcher process", "", "", "");
421
422         ereport(LOG,
423                         (errmsg("autovacuum launcher started")));
424
425         if (PostAuthDelay)
426                 pg_usleep(PostAuthDelay * 1000000L);
427
428         SetProcessingMode(InitProcessing);
429
430         /*
431          * If possible, make this process a group leader, so that the postmaster
432          * can signal any child processes too.  (autovacuum probably never has any
433          * child processes, but for consistency we make all postmaster child
434          * processes do this.)
435          */
436 #ifdef HAVE_SETSID
437         if (setsid() < 0)
438                 elog(FATAL, "setsid() failed: %m");
439 #endif
440
441         /*
442          * Set up signal handlers.  We operate on databases much like a regular
443          * backend, so we use the same signal handling.  See equivalent code in
444          * tcop/postgres.c.
445          */
446         pqsignal(SIGHUP, avl_sighup_handler);
447         pqsignal(SIGINT, StatementCancelHandler);
448         pqsignal(SIGTERM, avl_sigterm_handler);
449
450         pqsignal(SIGQUIT, quickdie);
451         InitializeTimeouts();           /* establishes SIGALRM handler */
452
453         pqsignal(SIGPIPE, SIG_IGN);
454         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
455         pqsignal(SIGUSR2, avl_sigusr2_handler);
456         pqsignal(SIGFPE, FloatExceptionHandler);
457         pqsignal(SIGCHLD, SIG_DFL);
458
459         /* Early initialization */
460         BaseInit();
461
462         /*
463          * Create a per-backend PGPROC struct in shared memory, except in the
464          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
465          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
466          * had to do some stuff with LWLocks).
467          */
468 #ifndef EXEC_BACKEND
469         InitProcess();
470 #endif
471
472         InitPostgres(NULL, InvalidOid, NULL, NULL);
473
474         SetProcessingMode(NormalProcessing);
475
476         /*
477          * Create a memory context that we will do all our work in.  We do this so
478          * that we can reset the context during error recovery and thereby avoid
479          * possible memory leaks.
480          */
481         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
482                                                                                   "Autovacuum Launcher",
483                                                                                   ALLOCSET_DEFAULT_MINSIZE,
484                                                                                   ALLOCSET_DEFAULT_INITSIZE,
485                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
486         MemoryContextSwitchTo(AutovacMemCxt);
487
488         /*
489          * If an exception is encountered, processing resumes here.
490          *
491          * This code is a stripped down version of PostgresMain error recovery.
492          */
493         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
494         {
495                 /* since not using PG_TRY, must reset error stack by hand */
496                 error_context_stack = NULL;
497
498                 /* Prevents interrupts while cleaning up */
499                 HOLD_INTERRUPTS();
500
501                 /* Forget any pending QueryCancel or timeout request */
502                 disable_all_timeouts(false);
503                 QueryCancelPending = false;             /* second to avoid race condition */
504
505                 /* Report the error to the server log */
506                 EmitErrorReport();
507
508                 /* Abort the current transaction in order to recover */
509                 AbortCurrentTransaction();
510
511                 /*
512                  * Now return to normal top-level context and clear ErrorContext for
513                  * next time.
514                  */
515                 MemoryContextSwitchTo(AutovacMemCxt);
516                 FlushErrorState();
517
518                 /* Flush any leaked data in the top-level context */
519                 MemoryContextResetAndDeleteChildren(AutovacMemCxt);
520
521                 /* don't leave dangling pointers to freed memory */
522                 DatabaseListCxt = NULL;
523                 dlist_init(&DatabaseList);
524
525                 /*
526                  * Make sure pgstat also considers our stat data as gone.  Note: we
527                  * mustn't use autovac_refresh_stats here.
528                  */
529                 pgstat_clear_snapshot();
530
531                 /* Now we can allow interrupts again */
532                 RESUME_INTERRUPTS();
533
534                 /*
535                  * Sleep at least 1 second after any error.  We don't want to be
536                  * filling the error logs as fast as we can.
537                  */
538                 pg_usleep(1000000L);
539         }
540
541         /* We can now handle ereport(ERROR) */
542         PG_exception_stack = &local_sigjmp_buf;
543
544         /* must unblock signals before calling rebuild_database_list */
545         PG_SETMASK(&UnBlockSig);
546
547         /*
548          * Force zero_damaged_pages OFF in the autovac process, even if it is set
549          * in postgresql.conf.  We don't really want such a dangerous option being
550          * applied non-interactively.
551          */
552         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
553
554         /*
555          * Force statement_timeout and lock_timeout to zero to avoid letting these
556          * settings prevent regular maintenance from being executed.
557          */
558         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
559         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
560
561         /*
562          * Force default_transaction_isolation to READ COMMITTED.  We don't want
563          * to pay the overhead of serializable mode, nor add any risk of causing
564          * deadlocks or delaying other transactions.
565          */
566         SetConfigOption("default_transaction_isolation", "read committed",
567                                         PGC_SUSET, PGC_S_OVERRIDE);
568
569         /* in emergency mode, just start a worker and go away */
570         if (!AutoVacuumingActive())
571         {
572                 do_start_worker();
573                 proc_exit(0);                   /* done */
574         }
575
576         AutoVacuumShmem->av_launcherpid = MyProcPid;
577
578         /*
579          * Create the initial database list.  The invariant we want this list to
580          * keep is that it's ordered by decreasing next_time.  As soon as an entry
581          * is updated to a higher time, it will be moved to the front (which is
582          * correct because the only operation is to add autovacuum_naptime to the
583          * entry, and time always increases).
584          */
585         rebuild_database_list(InvalidOid);
586
587         for (;;)
588         {
589                 struct timeval nap;
590                 TimestampTz current_time = 0;
591                 bool            can_launch;
592                 int                     rc;
593
594                 /*
595                  * This loop is a bit different from the normal use of WaitLatch,
596                  * because we'd like to sleep before the first launch of a child
597                  * process.  So it's WaitLatch, then ResetLatch, then check for
598                  * wakening conditions.
599                  */
600
601                 launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
602                                                                  false, &nap);
603
604                 /* Allow sinval catchup interrupts while sleeping */
605                 EnableCatchupInterrupt();
606
607                 /*
608                  * Wait until naptime expires or we get some type of signal (all the
609                  * signal handlers will wake us by calling SetLatch).
610                  */
611                 rc = WaitLatch(&MyProc->procLatch,
612                                            WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
613                                            (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
614
615                 ResetLatch(&MyProc->procLatch);
616
617                 DisableCatchupInterrupt();
618
619                 /*
620                  * Emergency bailout if postmaster has died.  This is to avoid the
621                  * necessity for manual cleanup of all postmaster children.
622                  */
623                 if (rc & WL_POSTMASTER_DEATH)
624                         proc_exit(1);
625
626                 /* the normal shutdown case */
627                 if (got_SIGTERM)
628                         break;
629
630                 if (got_SIGHUP)
631                 {
632                         got_SIGHUP = false;
633                         ProcessConfigFile(PGC_SIGHUP);
634
635                         /* shutdown requested in config file? */
636                         if (!AutoVacuumingActive())
637                                 break;
638
639                         /* rebalance in case the default cost parameters changed */
640                         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
641                         autovac_balance_cost();
642                         LWLockRelease(AutovacuumLock);
643
644                         /* rebuild the list in case the naptime changed */
645                         rebuild_database_list(InvalidOid);
646                 }
647
648                 /*
649                  * a worker finished, or postmaster signalled failure to start a
650                  * worker
651                  */
652                 if (got_SIGUSR2)
653                 {
654                         got_SIGUSR2 = false;
655
656                         /* rebalance cost limits, if needed */
657                         if (AutoVacuumShmem->av_signal[AutoVacRebalance])
658                         {
659                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
660                                 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
661                                 autovac_balance_cost();
662                                 LWLockRelease(AutovacuumLock);
663                         }
664
665                         if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
666                         {
667                                 /*
668                                  * If the postmaster failed to start a new worker, we sleep
669                                  * for a little while and resend the signal.  The new worker's
670                                  * state is still in memory, so this is sufficient.  After
671                                  * that, we restart the main loop.
672                                  *
673                                  * XXX should we put a limit to the number of times we retry?
674                                  * I don't think it makes much sense, because a future start
675                                  * of a worker will continue to fail in the same way.
676                                  */
677                                 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
678                                 pg_usleep(1000000L);    /* 1s */
679                                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
680                                 continue;
681                         }
682                 }
683
684                 /*
685                  * There are some conditions that we need to check before trying to
686                  * start a worker.  First, we need to make sure that there is a
687                  * worker slot available.  Second, we need to make sure that no
688                  * other worker failed while starting up.
689                  */
690
691                 current_time = GetCurrentTimestamp();
692                 LWLockAcquire(AutovacuumLock, LW_SHARED);
693
694                 can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
695
696                 if (AutoVacuumShmem->av_startingWorker != NULL)
697                 {
698                         int                     waittime;
699                         WorkerInfo      worker = AutoVacuumShmem->av_startingWorker;
700
701                         /*
702                          * We can't launch another worker when another one is still
703                          * starting up (or failed while doing so), so just sleep for a bit
704                          * more; that worker will wake us up again as soon as it's ready.
705                          * We will only wait autovacuum_naptime seconds (up to a maximum
706                          * of 60 seconds) for this to happen however.  Note that failure
707                          * to connect to a particular database is not a problem here,
708                          * because the worker removes itself from the startingWorker
709                          * pointer before trying to connect.  Problems detected by the
710                          * postmaster (like fork() failure) are also reported and handled
711                          * differently.  The only problems that may cause this code to
712                          * fire are errors in the earlier sections of AutoVacWorkerMain,
713                          * before the worker removes the WorkerInfo from the
714                          * startingWorker pointer.
715                          */
716                         waittime = Min(autovacuum_naptime, 60) * 1000;
717                         if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
718                                                                                    waittime))
719                         {
720                                 LWLockRelease(AutovacuumLock);
721                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
722
723                                 /*
724                                  * No other process can put a worker in starting mode, so if
725                                  * startingWorker is still non-NULL after exchanging our lock,
726                                  * we assume it's the same one we saw above (so we don't
727                                  * recheck the launch time).
728                                  */
729                                 if (AutoVacuumShmem->av_startingWorker != NULL)
730                                 {
731                                         worker = AutoVacuumShmem->av_startingWorker;
732                                         worker->wi_dboid = InvalidOid;
733                                         worker->wi_tableoid = InvalidOid;
734                                         worker->wi_proc = NULL;
735                                         worker->wi_launchtime = 0;
736                                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
737                                                                         &worker->wi_links);
738                                         AutoVacuumShmem->av_startingWorker = NULL;
739                                         elog(WARNING, "worker took too long to start; canceled");
740                                 }
741                         }
742                         else
743                                 can_launch = false;
744                 }
745                 LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
746
747                 /* if we can't do anything, just go back to sleep */
748                 if (!can_launch)
749                         continue;
750
751                 /* We're OK to start a new worker */
752
753                 if (dlist_is_empty(&DatabaseList))
754                 {
755                         /*
756                          * Special case when the list is empty: start a worker right away.
757                          * This covers the initial case, when no database is in pgstats
758                          * (thus the list is empty).  Note that the constraints in
759                          * launcher_determine_sleep keep us from starting workers too
760                          * quickly (at most once every autovacuum_naptime when the list is
761                          * empty).
762                          */
763                         launch_worker(current_time);
764                 }
765                 else
766                 {
767                         /*
768                          * because rebuild_database_list constructs a list with most
769                          * distant adl_next_worker first, we obtain our database from the
770                          * tail of the list.
771                          */
772                         avl_dbase  *avdb;
773
774                         avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
775
776                         /*
777                          * launch a worker if next_worker is right now or it is in the
778                          * past
779                          */
780                         if (TimestampDifferenceExceeds(avdb->adl_next_worker,
781                                                                                    current_time, 0))
782                                 launch_worker(current_time);
783                 }
784         }
785
786         /* Normal exit from the autovac launcher is here */
787         ereport(LOG,
788                         (errmsg("autovacuum launcher shutting down")));
789         AutoVacuumShmem->av_launcherpid = 0;
790
791         proc_exit(0);                           /* done */
792 }
793
794 /*
795  * Determine the time to sleep, based on the database list.
796  *
797  * The "canlaunch" parameter indicates whether we can start a worker right now,
798  * for example due to the workers being all busy.  If this is false, we will
799  * cause a long sleep, which will be interrupted when a worker exits.
800  */
801 static void
802 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap)
803 {
804         /*
805          * We sleep until the next scheduled vacuum.  We trust that when the
806          * database list was built, care was taken so that no entries have times
807          * in the past; if the first entry has too close a next_worker value, or a
808          * time in the past, we will sleep a small nominal time.
809          */
810         if (!canlaunch)
811         {
812                 nap->tv_sec = autovacuum_naptime;
813                 nap->tv_usec = 0;
814         }
815         else if (!dlist_is_empty(&DatabaseList))
816         {
817                 TimestampTz current_time = GetCurrentTimestamp();
818                 TimestampTz next_wakeup;
819                 avl_dbase  *avdb;
820                 long            secs;
821                 int                     usecs;
822
823                 avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
824
825                 next_wakeup = avdb->adl_next_worker;
826                 TimestampDifference(current_time, next_wakeup, &secs, &usecs);
827
828                 nap->tv_sec = secs;
829                 nap->tv_usec = usecs;
830         }
831         else
832         {
833                 /* list is empty, sleep for whole autovacuum_naptime seconds  */
834                 nap->tv_sec = autovacuum_naptime;
835                 nap->tv_usec = 0;
836         }
837
838         /*
839          * If the result is exactly zero, it means a database had an entry with
840          * time in the past.  Rebuild the list so that the databases are evenly
841          * distributed again, and recalculate the time to sleep.  This can happen
842          * if there are more tables needing vacuum than workers, and they all take
843          * longer to vacuum than autovacuum_naptime.
844          *
845          * We only recurse once.  rebuild_database_list should always return times
846          * in the future, but it seems best not to trust too much on that.
847          */
848         if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
849         {
850                 rebuild_database_list(InvalidOid);
851                 launcher_determine_sleep(canlaunch, true, nap);
852                 return;
853         }
854
855         /* The smallest time we'll allow the launcher to sleep. */
856         if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
857         {
858                 nap->tv_sec = 0;
859                 nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
860         }
861 }
862
863 /*
864  * Build an updated DatabaseList.  It must only contain databases that appear
865  * in pgstats, and must be sorted by next_worker from highest to lowest,
866  * distributed regularly across the next autovacuum_naptime interval.
867  *
868  * Receives the Oid of the database that made this list be generated (we call
869  * this the "new" database, because when the database was already present on
870  * the list, we expect that this function is not called at all).  The
871  * preexisting list, if any, will be used to preserve the order of the
872  * databases in the autovacuum_naptime period.  The new database is put at the
873  * end of the interval.  The actual values are not saved, which should not be
874  * much of a problem.
875  */
876 static void
877 rebuild_database_list(Oid newdb)
878 {
879         List       *dblist;
880         ListCell   *cell;
881         MemoryContext newcxt;
882         MemoryContext oldcxt;
883         MemoryContext tmpcxt;
884         HASHCTL         hctl;
885         int                     score;
886         int                     nelems;
887         HTAB       *dbhash;
888         dlist_iter      iter;
889
890         /* use fresh stats */
891         autovac_refresh_stats();
892
893         newcxt = AllocSetContextCreate(AutovacMemCxt,
894                                                                    "AV dblist",
895                                                                    ALLOCSET_DEFAULT_MINSIZE,
896                                                                    ALLOCSET_DEFAULT_INITSIZE,
897                                                                    ALLOCSET_DEFAULT_MAXSIZE);
898         tmpcxt = AllocSetContextCreate(newcxt,
899                                                                    "tmp AV dblist",
900                                                                    ALLOCSET_DEFAULT_MINSIZE,
901                                                                    ALLOCSET_DEFAULT_INITSIZE,
902                                                                    ALLOCSET_DEFAULT_MAXSIZE);
903         oldcxt = MemoryContextSwitchTo(tmpcxt);
904
905         /*
906          * Implementing this is not as simple as it sounds, because we need to put
907          * the new database at the end of the list; next the databases that were
908          * already on the list, and finally (at the tail of the list) all the
909          * other databases that are not on the existing list.
910          *
911          * To do this, we build an empty hash table of scored databases.  We will
912          * start with the lowest score (zero) for the new database, then
913          * increasing scores for the databases in the existing list, in order, and
914          * lastly increasing scores for all databases gotten via
915          * get_database_list() that are not already on the hash.
916          *
917          * Then we will put all the hash elements into an array, sort the array by
918          * score, and finally put the array elements into the new doubly linked
919          * list.
920          */
921         hctl.keysize = sizeof(Oid);
922         hctl.entrysize = sizeof(avl_dbase);
923         hctl.hash = oid_hash;
924         hctl.hcxt = tmpcxt;
925         dbhash = hash_create("db hash", 20, &hctl,      /* magic number here FIXME */
926                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
927
928         /* start by inserting the new database */
929         score = 0;
930         if (OidIsValid(newdb))
931         {
932                 avl_dbase  *db;
933                 PgStat_StatDBEntry *entry;
934
935                 /* only consider this database if it has a pgstat entry */
936                 entry = pgstat_fetch_stat_dbentry(newdb);
937                 if (entry != NULL)
938                 {
939                         /* we assume it isn't found because the hash was just created */
940                         db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
941
942                         /* hash_search already filled in the key */
943                         db->adl_score = score++;
944                         /* next_worker is filled in later */
945                 }
946         }
947
948         /* Now insert the databases from the existing list */
949         dlist_foreach(iter, &DatabaseList)
950         {
951                 avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
952                 avl_dbase  *db;
953                 bool            found;
954                 PgStat_StatDBEntry *entry;
955
956                 /*
957                  * skip databases with no stat entries -- in particular, this gets rid
958                  * of dropped databases
959                  */
960                 entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
961                 if (entry == NULL)
962                         continue;
963
964                 db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
965
966                 if (!found)
967                 {
968                         /* hash_search already filled in the key */
969                         db->adl_score = score++;
970                         /* next_worker is filled in later */
971                 }
972         }
973
974         /* finally, insert all qualifying databases not previously inserted */
975         dblist = get_database_list();
976         foreach(cell, dblist)
977         {
978                 avw_dbase  *avdb = lfirst(cell);
979                 avl_dbase  *db;
980                 bool            found;
981                 PgStat_StatDBEntry *entry;
982
983                 /* only consider databases with a pgstat entry */
984                 entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
985                 if (entry == NULL)
986                         continue;
987
988                 db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
989                 /* only update the score if the database was not already on the hash */
990                 if (!found)
991                 {
992                         /* hash_search already filled in the key */
993                         db->adl_score = score++;
994                         /* next_worker is filled in later */
995                 }
996         }
997         nelems = score;
998
999         /* from here on, the allocated memory belongs to the new list */
1000         MemoryContextSwitchTo(newcxt);
1001         dlist_init(&DatabaseList);
1002
1003         if (nelems > 0)
1004         {
1005                 TimestampTz current_time;
1006                 int                     millis_increment;
1007                 avl_dbase  *dbary;
1008                 avl_dbase  *db;
1009                 HASH_SEQ_STATUS seq;
1010                 int                     i;
1011
1012                 /* put all the hash elements into an array */
1013                 dbary = palloc(nelems * sizeof(avl_dbase));
1014
1015                 i = 0;
1016                 hash_seq_init(&seq, dbhash);
1017                 while ((db = hash_seq_search(&seq)) != NULL)
1018                         memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1019
1020                 /* sort the array */
1021                 qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1022
1023                 /*
1024                  * Determine the time interval between databases in the schedule. If
1025                  * we see that the configured naptime would take us to sleep times
1026                  * lower than our min sleep time (which launcher_determine_sleep is
1027                  * coded not to allow), silently use a larger naptime (but don't touch
1028                  * the GUC variable).
1029                  */
1030                 millis_increment = 1000.0 * autovacuum_naptime / nelems;
1031                 if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1032                         millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1033
1034                 current_time = GetCurrentTimestamp();
1035
1036                 /*
1037                  * move the elements from the array into the dllist, setting the
1038                  * next_worker while walking the array
1039                  */
1040                 for (i = 0; i < nelems; i++)
1041                 {
1042                         avl_dbase  *db = &(dbary[i]);
1043
1044                         current_time = TimestampTzPlusMilliseconds(current_time,
1045                                                                                                            millis_increment);
1046                         db->adl_next_worker = current_time;
1047
1048                         /* later elements should go closer to the head of the list */
1049                         dlist_push_head(&DatabaseList, &db->adl_node);
1050                 }
1051         }
1052
1053         /* all done, clean up memory */
1054         if (DatabaseListCxt != NULL)
1055                 MemoryContextDelete(DatabaseListCxt);
1056         MemoryContextDelete(tmpcxt);
1057         DatabaseListCxt = newcxt;
1058         MemoryContextSwitchTo(oldcxt);
1059 }
1060
1061 /* qsort comparator for avl_dbase, using adl_score */
1062 static int
1063 db_comparator(const void *a, const void *b)
1064 {
1065         if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1066                 return 0;
1067         else
1068                 return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1069 }
1070
1071 /*
1072  * do_start_worker
1073  *
1074  * Bare-bones procedure for starting an autovacuum worker from the launcher.
1075  * It determines what database to work on, sets up shared memory stuff and
1076  * signals postmaster to start the worker.      It fails gracefully if invoked when
1077  * autovacuum_workers are already active.
1078  *
1079  * Return value is the OID of the database that the worker is going to process,
1080  * or InvalidOid if no worker was actually started.
1081  */
1082 static Oid
1083 do_start_worker(void)
1084 {
1085         List       *dblist;
1086         ListCell   *cell;
1087         TransactionId xidForceLimit;
1088         MultiXactId multiForceLimit;
1089         bool            for_xid_wrap;
1090         bool            for_multi_wrap;
1091         avw_dbase  *avdb;
1092         TimestampTz current_time;
1093         bool            skipit = false;
1094         Oid                     retval = InvalidOid;
1095         MemoryContext tmpcxt,
1096                                 oldcxt;
1097
1098         /* return quickly when there are no free workers */
1099         LWLockAcquire(AutovacuumLock, LW_SHARED);
1100         if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1101         {
1102                 LWLockRelease(AutovacuumLock);
1103                 return InvalidOid;
1104         }
1105         LWLockRelease(AutovacuumLock);
1106
1107         /*
1108          * Create and switch to a temporary context to avoid leaking the memory
1109          * allocated for the database list.
1110          */
1111         tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1112                                                                    "Start worker tmp cxt",
1113                                                                    ALLOCSET_DEFAULT_MINSIZE,
1114                                                                    ALLOCSET_DEFAULT_INITSIZE,
1115                                                                    ALLOCSET_DEFAULT_MAXSIZE);
1116         oldcxt = MemoryContextSwitchTo(tmpcxt);
1117
1118         /* use fresh stats */
1119         autovac_refresh_stats();
1120
1121         /* Get a list of databases */
1122         dblist = get_database_list();
1123
1124         /*
1125          * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1126          * pass without forcing a vacuum.  (This limit can be tightened for
1127          * particular tables, but not loosened.)
1128          */
1129         recentXid = ReadNewTransactionId();
1130         xidForceLimit = recentXid - autovacuum_freeze_max_age;
1131         /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1132         /* this can cause the limit to go backwards by 3, but that's OK */
1133         if (xidForceLimit < FirstNormalTransactionId)
1134                 xidForceLimit -= FirstNormalTransactionId;
1135
1136         /* Also determine the oldest datminmxid we will consider. */
1137         recentMulti = ReadNextMultiXactId();
1138         multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age;
1139         if (multiForceLimit < FirstMultiXactId)
1140                 multiForceLimit -= FirstMultiXactId;
1141
1142         /*
1143          * Choose a database to connect to.  We pick the database that was least
1144          * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1145          * wraparound-related data loss.  If any db at risk of Xid wraparound is
1146          * found, we pick the one with oldest datfrozenxid, independently of
1147          * autovacuum times; similarly we pick the one with the oldest datminmxid
1148          * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1149          * danger are given more priority than those in multi wraparound danger.
1150          *
1151          * Note that a database with no stats entry is not considered, except for
1152          * Xid wraparound purposes.  The theory is that if no one has ever
1153          * connected to it since the stats were last initialized, it doesn't need
1154          * vacuuming.
1155          *
1156          * XXX This could be improved if we had more info about whether it needs
1157          * vacuuming before connecting to it.  Perhaps look through the pgstats
1158          * data for the database's tables?  One idea is to keep track of the
1159          * number of new and dead tuples per database in pgstats.  However it
1160          * isn't clear how to construct a metric that measures that and not cause
1161          * starvation for less busy databases.
1162          */
1163         avdb = NULL;
1164         for_xid_wrap = false;
1165         for_multi_wrap = false;
1166         current_time = GetCurrentTimestamp();
1167         foreach(cell, dblist)
1168         {
1169                 avw_dbase  *tmp = lfirst(cell);
1170                 dlist_iter      iter;
1171
1172                 /* Check to see if this one is at risk of wraparound */
1173                 if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1174                 {
1175                         if (avdb == NULL ||
1176                                 TransactionIdPrecedes(tmp->adw_frozenxid,
1177                                                                           avdb->adw_frozenxid))
1178                                 avdb = tmp;
1179                         for_xid_wrap = true;
1180                         continue;
1181                 }
1182                 else if (for_xid_wrap)
1183                         continue;                       /* ignore not-at-risk DBs */
1184                 else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1185                 {
1186                         if (avdb == NULL ||
1187                                 MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1188                                 avdb = tmp;
1189                         for_multi_wrap = true;
1190                         continue;
1191                 }
1192                 else if (for_multi_wrap)
1193                         continue;                       /* ignore not-at-risk DBs */
1194
1195                 /* Find pgstat entry if any */
1196                 tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1197
1198                 /*
1199                  * Skip a database with no pgstat entry; it means it hasn't seen any
1200                  * activity.
1201                  */
1202                 if (!tmp->adw_entry)
1203                         continue;
1204
1205                 /*
1206                  * Also, skip a database that appears on the database list as having
1207                  * been processed recently (less than autovacuum_naptime seconds ago).
1208                  * We do this so that we don't select a database which we just
1209                  * selected, but that pgstat hasn't gotten around to updating the last
1210                  * autovacuum time yet.
1211                  */
1212                 skipit = false;
1213
1214                 dlist_reverse_foreach(iter, &DatabaseList)
1215                 {
1216                         avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1217
1218                         if (dbp->adl_datid == tmp->adw_datid)
1219                         {
1220                                 /*
1221                                  * Skip this database if its next_worker value falls between
1222                                  * the current time and the current time plus naptime.
1223                                  */
1224                                 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1225                                                                                                 current_time, 0) &&
1226                                         !TimestampDifferenceExceeds(current_time,
1227                                                                                                 dbp->adl_next_worker,
1228                                                                                                 autovacuum_naptime * 1000))
1229                                         skipit = true;
1230
1231                                 break;
1232                         }
1233                 }
1234                 if (skipit)
1235                         continue;
1236
1237                 /*
1238                  * Remember the db with oldest autovac time.  (If we are here, both
1239                  * tmp->entry and db->entry must be non-null.)
1240                  */
1241                 if (avdb == NULL ||
1242                         tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1243                         avdb = tmp;
1244         }
1245
1246         /* Found a database -- process it */
1247         if (avdb != NULL)
1248         {
1249                 WorkerInfo      worker;
1250                 dlist_node *wptr;
1251
1252                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1253
1254                 /*
1255                  * Get a worker entry from the freelist.  We checked above, so there
1256                  * really should be a free slot.
1257                  */
1258                 wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1259
1260                 worker = dlist_container(WorkerInfoData, wi_links, wptr);
1261                 worker->wi_dboid = avdb->adw_datid;
1262                 worker->wi_proc = NULL;
1263                 worker->wi_launchtime = GetCurrentTimestamp();
1264
1265                 AutoVacuumShmem->av_startingWorker = worker;
1266
1267                 LWLockRelease(AutovacuumLock);
1268
1269                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1270
1271                 retval = avdb->adw_datid;
1272         }
1273         else if (skipit)
1274         {
1275                 /*
1276                  * If we skipped all databases on the list, rebuild it, because it
1277                  * probably contains a dropped database.
1278                  */
1279                 rebuild_database_list(InvalidOid);
1280         }
1281
1282         MemoryContextSwitchTo(oldcxt);
1283         MemoryContextDelete(tmpcxt);
1284
1285         return retval;
1286 }
1287
1288 /*
1289  * launch_worker
1290  *
1291  * Wrapper for starting a worker from the launcher.  Besides actually starting
1292  * it, update the database list to reflect the next time that another one will
1293  * need to be started on the selected database.  The actual database choice is
1294  * left to do_start_worker.
1295  *
1296  * This routine is also expected to insert an entry into the database list if
1297  * the selected database was previously absent from the list.
1298  */
1299 static void
1300 launch_worker(TimestampTz now)
1301 {
1302         Oid                     dbid;
1303         dlist_iter      iter;
1304
1305         dbid = do_start_worker();
1306         if (OidIsValid(dbid))
1307         {
1308                 bool            found = false;
1309
1310                 /*
1311                  * Walk the database list and update the corresponding entry.  If the
1312                  * database is not on the list, we'll recreate the list.
1313                  */
1314                 dlist_foreach(iter, &DatabaseList)
1315                 {
1316                         avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1317
1318                         if (avdb->adl_datid == dbid)
1319                         {
1320                                 found = true;
1321
1322                                 /*
1323                                  * add autovacuum_naptime seconds to the current time, and use
1324                                  * that as the new "next_worker" field for this database.
1325                                  */
1326                                 avdb->adl_next_worker =
1327                                         TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1328
1329                                 dlist_move_head(&DatabaseList, iter.cur);
1330                                 break;
1331                         }
1332                 }
1333
1334                 /*
1335                  * If the database was not present in the database list, we rebuild
1336                  * the list.  It's possible that the database does not get into the
1337                  * list anyway, for example if it's a database that doesn't have a
1338                  * pgstat entry, but this is not a problem because we don't want to
1339                  * schedule workers regularly into those in any case.
1340                  */
1341                 if (!found)
1342                         rebuild_database_list(dbid);
1343         }
1344 }
1345
1346 /*
1347  * Called from postmaster to signal a failure to fork a process to become
1348  * worker.      The postmaster should kill(SIGUSR2) the launcher shortly
1349  * after calling this function.
1350  */
1351 void
1352 AutoVacWorkerFailed(void)
1353 {
1354         AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1355 }
1356
1357 /* SIGHUP: set flag to re-read config file at next convenient time */
1358 static void
1359 avl_sighup_handler(SIGNAL_ARGS)
1360 {
1361         int                     save_errno = errno;
1362
1363         got_SIGHUP = true;
1364         if (MyProc)
1365                 SetLatch(&MyProc->procLatch);
1366
1367         errno = save_errno;
1368 }
1369
1370 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1371 static void
1372 avl_sigusr2_handler(SIGNAL_ARGS)
1373 {
1374         int                     save_errno = errno;
1375
1376         got_SIGUSR2 = true;
1377         if (MyProc)
1378                 SetLatch(&MyProc->procLatch);
1379
1380         errno = save_errno;
1381 }
1382
1383 /* SIGTERM: time to die */
1384 static void
1385 avl_sigterm_handler(SIGNAL_ARGS)
1386 {
1387         int                     save_errno = errno;
1388
1389         got_SIGTERM = true;
1390         if (MyProc)
1391                 SetLatch(&MyProc->procLatch);
1392
1393         errno = save_errno;
1394 }
1395
1396
1397 /********************************************************************
1398  *                                        AUTOVACUUM WORKER CODE
1399  ********************************************************************/
1400
1401 #ifdef EXEC_BACKEND
/*
 * forkexec routine for the autovacuum worker.
 *
 * Builds the argument list for the worker and hands it to
 * postmaster_forkexec, whose result is returned unchanged.
 */
static pid_t
avworker_forkexec(void)
{
	char	   *args[10];
	int			nargs = 0;

	args[nargs++] = "postgres";
	args[nargs++] = "--forkavworker";
	args[nargs++] = NULL;		/* filled in by postmaster_forkexec */
	args[nargs] = NULL;			/* terminator, not counted in nargs */

	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
1422
1423 /*
1424  * We need this set from the outside, before InitProcess is called
1425  */
1426 void
1427 AutovacuumWorkerIAm(void)
1428 {
1429         am_autovacuum_worker = true;
1430 }
1431 #endif
1432
1433 /*
1434  * Main entry point for autovacuum worker process.
1435  *
1436  * This code is heavily based on pgarch.c, q.v.
1437  */
1438 int
1439 StartAutoVacWorker(void)
1440 {
1441         pid_t           worker_pid;
1442
1443 #ifdef EXEC_BACKEND
1444         switch ((worker_pid = avworker_forkexec()))
1445 #else
1446         switch ((worker_pid = fork_process()))
1447 #endif
1448         {
1449                 case -1:
1450                         ereport(LOG,
1451                                         (errmsg("could not fork autovacuum worker process: %m")));
1452                         return 0;
1453
1454 #ifndef EXEC_BACKEND
1455                 case 0:
1456                         /* in postmaster child ... */
1457                         /* Close the postmaster's sockets */
1458                         ClosePostmasterPorts(false);
1459
1460                         /* Lose the postmaster's on-exit routines */
1461                         on_exit_reset();
1462
1463                         AutoVacWorkerMain(0, NULL);
1464                         break;
1465 #endif
1466                 default:
1467                         return (int) worker_pid;
1468         }
1469
1470         /* shouldn't get here */
1471         return 0;
1472 }
1473
1474 /*
1475  * AutoVacWorkerMain
      *
      * Body of an autovacuum worker process.  Marks the process as a postmaster
      * child and an autovacuum worker, installs signal handlers, sets up an
      * error recovery point, overrides a few configuration options, claims the
      * WorkerInfo slot found in AutoVacuumShmem->av_startingWorker, connects to
      * that slot's database and runs do_autovacuum().
      *
      * argc and argv are not referenced anywhere in the body.
      *
      * Both the normal path and the sigsetjmp error-recovery path finish with
      * proc_exit(0); that includes the case where no worker slot was found.
1476  */
1477 NON_EXEC_STATIC void
1478 AutoVacWorkerMain(int argc, char *argv[])
1479 {
1480         sigjmp_buf      local_sigjmp_buf;
1481         Oid                     dbid;
1482
1483         /* we are a postmaster subprocess now */
1484         IsUnderPostmaster = true;
1485         am_autovacuum_worker = true;
1486
1487         /* reset MyProcPid */
1488         MyProcPid = getpid();
1489
1490         /* record Start Time for logging */
1491         MyStartTime = time(NULL);
1492
1493         /* Identify myself via ps */
1494         init_ps_display("autovacuum worker process", "", "", "");
1495
1496         SetProcessingMode(InitProcessing);
1497
1498         /*
1499          * If possible, make this process a group leader, so that the postmaster
1500          * can signal any child processes too.  (autovacuum probably never has any
1501          * child processes, but for consistency we make all postmaster child
1502          * processes do this.)
1503          */
1504 #ifdef HAVE_SETSID
1505         if (setsid() < 0)
1506                 elog(FATAL, "setsid() failed: %m");
1507 #endif
1508
1509         /*
1510          * Set up signal handlers.      We operate on databases much like a regular
1511          * backend, so we use the same signal handling.  See equivalent code in
1512          * tcop/postgres.c.
1513          *
1514          * Currently, we don't pay attention to postgresql.conf changes that
1515          * happen during a single daemon iteration, so we can ignore SIGHUP.
1516          */
1517         pqsignal(SIGHUP, SIG_IGN);
1518
1519         /*
1520          * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1521          * means abort and exit cleanly, and SIGQUIT means abandon ship.
1522          */
1523         pqsignal(SIGINT, StatementCancelHandler);
1524         pqsignal(SIGTERM, die);
1525         pqsignal(SIGQUIT, quickdie);
1526         InitializeTimeouts();           /* establishes SIGALRM handler */
1527
             /* Remaining signals: SIGPIPE and SIGUSR2 are ignored, SIGCHLD gets the default action */
1528         pqsignal(SIGPIPE, SIG_IGN);
1529         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1530         pqsignal(SIGUSR2, SIG_IGN);
1531         pqsignal(SIGFPE, FloatExceptionHandler);
1532         pqsignal(SIGCHLD, SIG_DFL);
1533
1534         /* Early initialization */
1535         BaseInit();
1536
1537         /*
1538          * Create a per-backend PGPROC struct in shared memory, except in the
1539          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1540          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1541          * had to do some stuff with LWLocks).
1542          */
1543 #ifndef EXEC_BACKEND
1544         InitProcess();
1545 #endif
1546
1547         /*
1548          * If an exception is encountered, processing resumes here.
1549          *
1550          * See notes in postgres.c about the design of this coding.
              *
              * Unlike a loop-style recovery block, this one does not retry: it
              * reports the error and exits the process.
1551          */
1552         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1553         {
1554                 /* Prevents interrupts while cleaning up */
1555                 HOLD_INTERRUPTS();
1556
1557                 /* Report the error to the server log */
1558                 EmitErrorReport();
1559
1560                 /*
1561                  * We can now go away.  Note that because we called InitProcess, a
1562                  * callback was registered to do ProcKill, which will clean up
1563                  * necessary state.
                      *
                      * Note the exit status is 0 even on this error path.
1564                  */
1565                 proc_exit(0);
1566         }
1567
1568         /* We can now handle ereport(ERROR) */
1569         PG_exception_stack = &local_sigjmp_buf;
1570
             /* Install the UnBlockSig mask; handlers and the recovery point are in place by now */
1571         PG_SETMASK(&UnBlockSig);
1572
1573         /*
1574          * Force zero_damaged_pages OFF in the autovac process, even if it is set
1575          * in postgresql.conf.  We don't really want such a dangerous option being
1576          * applied non-interactively.
1577          */
1578         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1579
1580         /*
1581          * Force statement_timeout and lock_timeout to zero to avoid letting these
1582          * settings prevent regular maintenance from being executed.
1583          */
1584         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1585         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1586
1587         /*
1588          * Force default_transaction_isolation to READ COMMITTED.  We don't want
1589          * to pay the overhead of serializable mode, nor add any risk of causing
1590          * deadlocks or delaying other transactions.
1591          */
1592         SetConfigOption("default_transaction_isolation", "read committed",
1593                                         PGC_SUSET, PGC_S_OVERRIDE);
1594
1595         /*
1596          * Force synchronous replication off to allow regular maintenance even if
1597          * we are waiting for standbys to connect. This is important to ensure we
1598          * aren't blocked from performing anti-wraparound tasks.
              *
              * The override is applied only when the current setting is above
              * SYNCHRONOUS_COMMIT_LOCAL_FLUSH; lower settings are left alone.
1599          */
1600         if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1601                 SetConfigOption("synchronous_commit", "local",
1602                                                 PGC_SUSET, PGC_S_OVERRIDE);
1603
1604         /*
1605          * Get the info about the database we're going to work on.
              *
              * AutovacuumLock is taken exclusively because the shared worker lists
              * and the av_startingWorker pointer are modified below.  Both branches
              * release the lock themselves.
1606          */
1607         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1608
1609         /*
1610          * beware of startingWorker being INVALID; this should normally not
1611          * happen, but if a worker fails after forking and before this, the
1612          * launcher might have decided to remove it from the queue and start
1613          * again.
1614          */
1615         if (AutoVacuumShmem->av_startingWorker != NULL)
1616         {
                     /* adopt the slot: remember it, read the target database, record MyProc in it */
1617                 MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1618                 dbid = MyWorkerInfo->wi_dboid;
1619                 MyWorkerInfo->wi_proc = MyProc;
1620
1621                 /* insert into the running list */
1622                 dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1623                                                 &MyWorkerInfo->wi_links);
1624
1625                 /*
1626                  * remove from the "starting" pointer, so that the launcher can start
1627                  * a new worker if required
1628                  */
1629                 AutoVacuumShmem->av_startingWorker = NULL;
1630                 LWLockRelease(AutovacuumLock);
1631
                     /* register FreeWorkerInfo as an exit callback; it returns the slot to the free list */
1632                 on_shmem_exit(FreeWorkerInfo, 0);
1633
1634                 /* wake up the launcher */
                     /* (av_launcherpid is read after the lock was released; kill()'s result is not checked) */
1635                 if (AutoVacuumShmem->av_launcherpid != 0)
1636                         kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1637         }
1638         else
1639         {
1640                 /* no worker entry for me, go away */
1641                 elog(WARNING, "autovacuum worker started without a worker entry");
1642                 dbid = InvalidOid;
1643                 LWLockRelease(AutovacuumLock);
1644         }
1645
             /* dbid is InvalidOid only when no worker slot was found above */
1646         if (OidIsValid(dbid))
1647         {
                     /*
                      * dbname is not initialized here and is read right after
                      * InitPostgres returns, so InitPostgres is expected to fill it in.
                      */
1648                 char            dbname[NAMEDATALEN];
1649
1650                 /*
1651                  * Report autovac startup to the stats collector.  We deliberately do
1652                  * this before InitPostgres, so that the last_autovac_time will get
1653                  * updated even if the connection attempt fails.  This is to prevent
1654                  * autovac from getting "stuck" repeatedly selecting an unopenable
1655                  * database, rather than making any progress on stuff it can connect
1656                  * to.
1657                  */
1658                 pgstat_report_autovac(dbid);
1659
1660                 /*
1661                  * Connect to the selected database
1662                  *
1663                  * Note: if we have selected a just-deleted database (due to using
1664                  * stale stats info), we'll fail and exit here.
1665                  */
1666                 InitPostgres(NULL, dbid, NULL, dbname);
1667                 SetProcessingMode(NormalProcessing);
1668                 set_ps_display(dbname, false);
1669                 ereport(DEBUG1,
1670                                 (errmsg("autovacuum: processing database \"%s\"", dbname)));
1671
                     /* optional pause; PostAuthDelay is scaled by 1000000 for pg_usleep, so presumably seconds */
1672                 if (PostAuthDelay)
1673                         pg_usleep(PostAuthDelay * 1000000L);
1674
1675                 /* And do an appropriate amount of work */
                     /* recentXid and recentMulti are captured once, before the run starts */
1676                 recentXid = ReadNewTransactionId();
1677                 recentMulti = ReadNextMultiXactId();
1678                 do_autovacuum();
1679         }
1680
1681         /*
1682          * The launcher will be notified of my death in ProcKill, *if* we managed
1683          * to get a worker slot at all
1684          */
1685
1686         /* All done, go away */
1687         proc_exit(0);
1688 }
1689
1690 /*
1691  * Return a WorkerInfo to the free list
1692  */
1693 static void
1694 FreeWorkerInfo(int code, Datum arg)
1695 {
1696         if (MyWorkerInfo != NULL)
1697         {
1698                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1699
1700                 /*
1701                  * Wake the launcher up so that he can launch a new worker immediately
1702                  * if required.  We only save the launcher's PID in local memory here;
1703                  * the actual signal will be sent when the PGPROC is recycled.  Note
1704                  * that we always do this, so that the launcher can rebalance the cost
1705                  * limit setting of the remaining workers.
1706                  *
1707                  * We somewhat ignore the risk that the launcher changes its PID
1708                  * between us reading it and the actual kill; we expect ProcKill to be
1709                  * called shortly after us, and we assume that PIDs are not reused too
1710                  * quickly after a process exits.
1711                  */
1712                 AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1713
1714                 dlist_delete(&MyWorkerInfo->wi_links);
1715                 MyWorkerInfo->wi_dboid = InvalidOid;
1716                 MyWorkerInfo->wi_tableoid = InvalidOid;
1717                 MyWorkerInfo->wi_proc = NULL;
1718                 MyWorkerInfo->wi_launchtime = 0;
1719                 MyWorkerInfo->wi_cost_delay = 0;
1720                 MyWorkerInfo->wi_cost_limit = 0;
1721                 MyWorkerInfo->wi_cost_limit_base = 0;
1722                 dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1723                                                 &MyWorkerInfo->wi_links);
1724                 /* not mine anymore */
1725                 MyWorkerInfo = NULL;
1726
1727                 /*
1728                  * now that we're inactive, cause a rebalancing of the surviving
1729                  * workers
1730                  */
1731                 AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1732                 LWLockRelease(AutovacuumLock);
1733         }
1734 }
1735
1736 /*
1737  * Update the cost-based delay parameters, so that multiple workers consume
1738  * each a fraction of the total available I/O.
1739  */
1740 void
1741 AutoVacuumUpdateDelay(void)
1742 {
1743         if (MyWorkerInfo)
1744         {
1745                 VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1746                 VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1747         }
1748 }
1749
1750 /*
1751  * autovac_balance_cost
1752  *              Recalculate the cost limit setting for each active worker.
1753  *
1754  * Caller must hold the AutovacuumLock in exclusive mode.
1755  */
1756 static void
1757 autovac_balance_cost(void)
1758 {
1759         /*
1760          * The idea here is that we ration out I/O equally.  The amount of I/O
1761          * that a worker can consume is determined by cost_limit/cost_delay, so we
1762          * try to equalize those ratios rather than the raw limit settings.
1763          *
1764          * note: in cost_limit, zero also means use value from elsewhere, because
1765          * zero is not a valid value.
1766          */
1767         int                     vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1768                                                                 autovacuum_vac_cost_limit : VacuumCostLimit);
1769         int                     vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1770                                                                 autovacuum_vac_cost_delay : VacuumCostDelay);
1771         double          cost_total;
1772         double          cost_avail;
1773         dlist_iter      iter;
1774
1775         /* not set? nothing to do */
1776         if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1777                 return;
1778
1779         /* caculate the total base cost limit of active workers */
1780         cost_total = 0.0;
1781         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1782         {
1783                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1784
1785                 if (worker->wi_proc != NULL &&
1786                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1787                         cost_total +=
1788                                 (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1789         }
1790         /* there are no cost limits -- nothing to do */
1791         if (cost_total <= 0)
1792                 return;
1793
1794         /*
1795          * Adjust cost limit of each active worker to balance the total of cost
1796          * limit to autovacuum_vacuum_cost_limit.
1797          */
1798         cost_avail = (double) vac_cost_limit / vac_cost_delay;
1799         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1800         {
1801                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1802
1803                 if (worker->wi_proc != NULL &&
1804                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1805                 {
1806                         int                     limit = (int)
1807                         (cost_avail * worker->wi_cost_limit_base / cost_total);
1808
1809                         /*
1810                          * We put a lower bound of 1 on the cost_limit, to avoid division-
1811                          * by-zero in the vacuum code.  Also, in case of roundoff trouble
1812                          * in these calculations, let's be sure we don't ever set
1813                          * cost_limit to more than the base value.
1814                          */
1815                         worker->wi_cost_limit = Max(Min(limit,
1816                                                                                         worker->wi_cost_limit_base),
1817                                                                                 1);
1818
1819                         elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1820                                  worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1821                                  worker->wi_cost_limit, worker->wi_cost_limit_base,
1822                                  worker->wi_cost_delay);
1823                 }
1824         }
1825 }
1826
1827 /*
1828  * get_database_list
1829  *              Return a list of all databases found in pg_database.
1830  *
1831  * The list and associated data is allocated in the caller's memory context,
1832  * which is in charge of ensuring that it's properly cleaned up afterwards.
1833  *
1834  * Note: this is the only function in which the autovacuum launcher uses a
1835  * transaction.  Although we aren't attached to any particular database and
1836  * therefore can't access most catalogs, we do have enough infrastructure
1837  * to do a seqscan on pg_database.
1838  */
1839 static List *
1840 get_database_list(void)
1841 {
1842         List       *dblist = NIL;
1843         Relation        rel;
1844         HeapScanDesc scan;
1845         HeapTuple       tup;
1846         MemoryContext resultcxt;
1847
1848         /* This is the context that we will allocate our output data in */
1849         resultcxt = CurrentMemoryContext;
1850
1851         /*
1852          * Start a transaction so we can access pg_database, and get a snapshot.
1853          * We don't have a use for the snapshot itself, but we're interested in
1854          * the secondary effect that it sets RecentGlobalXmin.  (This is critical
1855          * for anything that reads heap pages, because HOT may decide to prune
1856          * them even if the process doesn't attempt to modify any tuples.)
1857          */
1858         StartTransactionCommand();
1859         (void) GetTransactionSnapshot();
1860
1861         rel = heap_open(DatabaseRelationId, AccessShareLock);
1862         scan = heap_beginscan_catalog(rel, 0, NULL);
1863
1864         while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1865         {
1866                 Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1867                 avw_dbase  *avdb;
1868                 MemoryContext oldcxt;
1869
1870                 /*
1871                  * Allocate our results in the caller's context, not the
1872                  * transaction's. We do this inside the loop, and restore the original
1873                  * context at the end, so that leaky things like heap_getnext() are
1874                  * not called in a potentially long-lived context.
1875                  */
1876                 oldcxt = MemoryContextSwitchTo(resultcxt);
1877
1878                 avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1879
1880                 avdb->adw_datid = HeapTupleGetOid(tup);
1881                 avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
1882                 avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1883                 avdb->adw_minmulti = pgdatabase->datminmxid;
1884                 /* this gets set later: */
1885                 avdb->adw_entry = NULL;
1886
1887                 dblist = lappend(dblist, avdb);
1888                 MemoryContextSwitchTo(oldcxt);
1889         }
1890
1891         heap_endscan(scan);
1892         heap_close(rel, AccessShareLock);
1893
1894         CommitTransactionCommand();
1895
1896         return dblist;
1897 }
1898
1899 /*
1900  * Process a database table-by-table
1901  *
1902  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1903  * order not to ignore shutdown commands for too long.
1904  */
1905 static void
1906 do_autovacuum(void)
1907 {
1908         Relation        classRel;
1909         HeapTuple       tuple;
1910         HeapScanDesc relScan;
1911         Form_pg_database dbForm;
1912         List       *table_oids = NIL;
1913         HASHCTL         ctl;
1914         HTAB       *table_toast_map;
1915         ListCell   *volatile cell;
1916         PgStat_StatDBEntry *shared;
1917         PgStat_StatDBEntry *dbentry;
1918         BufferAccessStrategy bstrategy;
1919         ScanKeyData key;
1920         TupleDesc       pg_class_desc;
1921
1922         /*
1923          * StartTransactionCommand and CommitTransactionCommand will automatically
1924          * switch to other contexts.  We need this one to keep the list of
1925          * relations to vacuum/analyze across transactions.
1926          */
1927         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1928                                                                                   "AV worker",
1929                                                                                   ALLOCSET_DEFAULT_MINSIZE,
1930                                                                                   ALLOCSET_DEFAULT_INITSIZE,
1931                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
1932         MemoryContextSwitchTo(AutovacMemCxt);
1933
1934         /*
1935          * may be NULL if we couldn't find an entry (only happens if we are
1936          * forcing a vacuum for anti-wrap purposes).
1937          */
1938         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1939
1940         /* Start a transaction so our commands have one to play into. */
1941         StartTransactionCommand();
1942
1943         /*
1944          * Clean up any dead statistics collector entries for this DB. We always
1945          * want to do this exactly once per DB-processing cycle, even if we find
1946          * nothing worth vacuuming in the database.
1947          */
1948         pgstat_vacuum_stat();
1949
1950         /*
1951          * Find the pg_database entry and select the default freeze ages. We use
1952          * zero in template and nonconnectable databases, else the system-wide
1953          * default.
1954          */
1955         tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
1956         if (!HeapTupleIsValid(tuple))
1957                 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1958         dbForm = (Form_pg_database) GETSTRUCT(tuple);
1959
1960         if (dbForm->datistemplate || !dbForm->datallowconn)
1961         {
1962                 default_freeze_min_age = 0;
1963                 default_freeze_table_age = 0;
1964                 default_multixact_freeze_min_age = 0;
1965                 default_multixact_freeze_table_age = 0;
1966         }
1967         else
1968         {
1969                 default_freeze_min_age = vacuum_freeze_min_age;
1970                 default_freeze_table_age = vacuum_freeze_table_age;
1971                 default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
1972                 default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
1973         }
1974
1975         ReleaseSysCache(tuple);
1976
1977         /* StartTransactionCommand changed elsewhere */
1978         MemoryContextSwitchTo(AutovacMemCxt);
1979
1980         /* The database hash where pgstat keeps shared relations */
1981         shared = pgstat_fetch_stat_dbentry(InvalidOid);
1982
1983         classRel = heap_open(RelationRelationId, AccessShareLock);
1984
1985         /* create a copy so we can use it after closing pg_class */
1986         pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
1987
1988         /* create hash table for toast <-> main relid mapping */
1989         MemSet(&ctl, 0, sizeof(ctl));
1990         ctl.keysize = sizeof(Oid);
1991         ctl.entrysize = sizeof(av_relation);
1992         ctl.hash = oid_hash;
1993
1994         table_toast_map = hash_create("TOAST to main relid map",
1995                                                                   100,
1996                                                                   &ctl,
1997                                                                   HASH_ELEM | HASH_FUNCTION);
1998
1999         /*
2000          * Scan pg_class to determine which tables to vacuum.
2001          *
2002          * We do this in two passes: on the first one we collect the list of plain
2003          * relations and materialized views, and on the second one we collect
2004          * TOAST tables. The reason for doing the second pass is that during it we
2005          * want to use the main relation's pg_class.reloptions entry if the TOAST
2006          * table does not have any, and we cannot obtain it unless we know
2007          * beforehand what's the main  table OID.
2008          *
2009          * We need to check TOAST tables separately because in cases with short,
2010          * wide tables there might be proportionally much more activity in the
2011          * TOAST table than in its parent.
2012          */
2013         relScan = heap_beginscan_catalog(classRel, 0, NULL);
2014
2015         /*
2016          * On the first pass, we collect main tables to vacuum, and also the main
2017          * table relid to TOAST relid mapping.
2018          */
2019         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2020         {
2021                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2022                 PgStat_StatTabEntry *tabentry;
2023                 AutoVacOpts *relopts;
2024                 Oid                     relid;
2025                 bool            dovacuum;
2026                 bool            doanalyze;
2027                 bool            wraparound;
2028
2029                 if (classForm->relkind != RELKIND_RELATION &&
2030                         classForm->relkind != RELKIND_MATVIEW)
2031                         continue;
2032
2033                 relid = HeapTupleGetOid(tuple);
2034
2035                 /* Fetch reloptions and the pgstat entry for this table */
2036                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2037                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2038                                                                                          shared, dbentry);
2039
2040                 /* Check if it needs vacuum or analyze */
2041                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2042                                                                   &dovacuum, &doanalyze, &wraparound);
2043
2044                 /*
2045                  * Check if it is a temp table (presumably, of some other backend's).
2046                  * We cannot safely process other backends' temp tables.
2047                  */
2048                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2049                 {
2050                         int                     backendID;
2051
2052                         backendID = GetTempNamespaceBackendId(classForm->relnamespace);
2053
2054                         /* We just ignore it if the owning backend is still active */
2055                         if (backendID == MyBackendId || BackendIdGetProc(backendID) == NULL)
2056                         {
2057                                 /*
2058                                  * We found an orphan temp table (which was probably left
2059                                  * behind by a crashed backend).  If it's so old as to need
2060                                  * vacuum for wraparound, forcibly drop it.  Otherwise just
2061                                  * log a complaint.
2062                                  */
2063                                 if (wraparound)
2064                                 {
2065                                         ObjectAddress object;
2066
2067                                         ereport(LOG,
2068                                                         (errmsg("autovacuum: dropping orphan temp table \"%s\".\"%s\" in database \"%s\"",
2069                                                                  get_namespace_name(classForm->relnamespace),
2070                                                                         NameStr(classForm->relname),
2071                                                                         get_database_name(MyDatabaseId))));
2072                                         object.classId = RelationRelationId;
2073                                         object.objectId = relid;
2074                                         object.objectSubId = 0;
2075                                         performDeletion(&object, DROP_CASCADE, PERFORM_DELETION_INTERNAL);
2076                                 }
2077                                 else
2078                                 {
2079                                         ereport(LOG,
2080                                                         (errmsg("autovacuum: found orphan temp table \"%s\".\"%s\" in database \"%s\"",
2081                                                                  get_namespace_name(classForm->relnamespace),
2082                                                                         NameStr(classForm->relname),
2083                                                                         get_database_name(MyDatabaseId))));
2084                                 }
2085                         }
2086                 }
2087                 else
2088                 {
2089                         /* relations that need work are added to table_oids */
2090                         if (dovacuum || doanalyze)
2091                                 table_oids = lappend_oid(table_oids, relid);
2092
2093                         /*
2094                          * Remember the association for the second pass.  Note: we must do
2095                          * this even if the table is going to be vacuumed, because we
2096                          * don't automatically vacuum toast tables along the parent table.
2097                          */
2098                         if (OidIsValid(classForm->reltoastrelid))
2099                         {
2100                                 av_relation *hentry;
2101                                 bool            found;
2102
2103                                 hentry = hash_search(table_toast_map,
2104                                                                          &classForm->reltoastrelid,
2105                                                                          HASH_ENTER, &found);
2106
2107                                 if (!found)
2108                                 {
2109                                         /* hash_search already filled in the key */
2110                                         hentry->ar_relid = relid;
2111                                         hentry->ar_hasrelopts = false;
2112                                         if (relopts != NULL)
2113                                         {
2114                                                 hentry->ar_hasrelopts = true;
2115                                                 memcpy(&hentry->ar_reloptions, relopts,
2116                                                            sizeof(AutoVacOpts));
2117                                         }
2118                                 }
2119                         }
2120                 }
2121         }
2122
2123         heap_endscan(relScan);
2124
2125         /* second pass: check TOAST tables */
2126         ScanKeyInit(&key,
2127                                 Anum_pg_class_relkind,
2128                                 BTEqualStrategyNumber, F_CHAREQ,
2129                                 CharGetDatum(RELKIND_TOASTVALUE));
2130
2131         relScan = heap_beginscan_catalog(classRel, 1, &key);
2132         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2133         {
2134                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2135                 PgStat_StatTabEntry *tabentry;
2136                 Oid                     relid;
2137                 AutoVacOpts *relopts = NULL;
2138                 bool            dovacuum;
2139                 bool            doanalyze;
2140                 bool            wraparound;
2141
2142                 /*
2143                  * We cannot safely process other backends' temp tables, so skip 'em.
2144                  */
2145                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2146                         continue;
2147
2148                 relid = HeapTupleGetOid(tuple);
2149
2150                 /*
2151                  * fetch reloptions -- if this toast table does not have them, try the
2152                  * main rel
2153                  */
2154                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2155                 if (relopts == NULL)
2156                 {
2157                         av_relation *hentry;
2158                         bool            found;
2159
2160                         hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2161                         if (found && hentry->ar_hasrelopts)
2162                                 relopts = &hentry->ar_reloptions;
2163                 }
2164
2165                 /* Fetch the pgstat entry for this table */
2166                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2167                                                                                          shared, dbentry);
2168
2169                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2170                                                                   &dovacuum, &doanalyze, &wraparound);
2171
2172                 /* ignore analyze for toast tables */
2173                 if (dovacuum)
2174                         table_oids = lappend_oid(table_oids, relid);
2175         }
2176
2177         heap_endscan(relScan);
2178         heap_close(classRel, AccessShareLock);
2179
2180         /*
2181          * Create a buffer access strategy object for VACUUM to use.  We want to
2182          * use the same one across all the vacuum operations we perform, since the
2183          * point is for VACUUM not to blow out the shared cache.
2184          */
2185         bstrategy = GetAccessStrategy(BAS_VACUUM);
2186
2187         /*
2188          * create a memory context to act as fake PortalContext, so that the
2189          * contexts created in the vacuum code are cleaned up for each table.
2190          */
2191         PortalContext = AllocSetContextCreate(AutovacMemCxt,
2192                                                                                   "Autovacuum Portal",
2193                                                                                   ALLOCSET_DEFAULT_INITSIZE,
2194                                                                                   ALLOCSET_DEFAULT_MINSIZE,
2195                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
2196
2197         /*
2198          * Perform operations on collected tables.
2199          */
2200         foreach(cell, table_oids)
2201         {
2202                 Oid                     relid = lfirst_oid(cell);
2203                 autovac_table *tab;
2204                 bool            skipit;
2205                 int                     stdVacuumCostDelay;
2206                 int                     stdVacuumCostLimit;
2207                 dlist_iter      iter;
2208
2209                 CHECK_FOR_INTERRUPTS();
2210
2211                 /*
2212                  * hold schedule lock from here until we're sure that this table still
2213                  * needs vacuuming.  We also need the AutovacuumLock to walk the
2214                  * worker array, but we'll let go of that one quickly.
2215                  */
2216                 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2217                 LWLockAcquire(AutovacuumLock, LW_SHARED);
2218
2219                 /*
2220                  * Check whether the table is being vacuumed concurrently by another
2221                  * worker.
2222                  */
2223                 skipit = false;
2224                 dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2225                 {
2226                         WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2227
2228                         /* ignore myself */
2229                         if (worker == MyWorkerInfo)
2230                                 continue;
2231
2232                         /* ignore workers in other databases */
2233                         if (worker->wi_dboid != MyDatabaseId)
2234                                 continue;
2235
2236                         if (worker->wi_tableoid == relid)
2237                         {
2238                                 skipit = true;
2239                                 break;
2240                         }
2241                 }
2242                 LWLockRelease(AutovacuumLock);
2243                 if (skipit)
2244                 {
2245                         LWLockRelease(AutovacuumScheduleLock);
2246                         continue;
2247                 }
2248
2249                 /*
2250                  * Check whether pgstat data still says we need to vacuum this table.
2251                  * It could have changed if something else processed the table while
2252                  * we weren't looking.
2253                  *
2254                  * Note: we have a special case in pgstat code to ensure that the
2255                  * stats we read are as up-to-date as possible, to avoid the problem
2256                  * that somebody just finished vacuuming this table.  The window to
2257                  * the race condition is not closed but it is very small.
2258                  */
2259                 MemoryContextSwitchTo(AutovacMemCxt);
2260                 tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc);
2261                 if (tab == NULL)
2262                 {
2263                         /* someone else vacuumed the table, or it went away */
2264                         LWLockRelease(AutovacuumScheduleLock);
2265                         continue;
2266                 }
2267
2268                 /*
2269                  * Ok, good to go.      Store the table in shared memory before releasing
2270                  * the lock so that other workers don't vacuum it concurrently.
2271                  */
2272                 MyWorkerInfo->wi_tableoid = relid;
2273                 LWLockRelease(AutovacuumScheduleLock);
2274
2275                 /*
2276                  * Remember the prevailing values of the vacuum cost GUCs.      We have to
2277                  * restore these at the bottom of the loop, else we'll compute wrong
2278                  * values in the next iteration of autovac_balance_cost().
2279                  */
2280                 stdVacuumCostDelay = VacuumCostDelay;
2281                 stdVacuumCostLimit = VacuumCostLimit;
2282
2283                 /* Must hold AutovacuumLock while mucking with cost balance info */
2284                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2285
2286                 /* advertise my cost delay parameters for the balancing algorithm */
2287                 MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2288                 MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2289                 MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2290
2291                 /* do a balance */
2292                 autovac_balance_cost();
2293
2294                 /* set the active cost parameters from the result of that */
2295                 AutoVacuumUpdateDelay();
2296
2297                 /* done */
2298                 LWLockRelease(AutovacuumLock);
2299
2300                 /* clean up memory before each iteration */
2301                 MemoryContextResetAndDeleteChildren(PortalContext);
2302
2303                 /*
2304                  * Save the relation name for a possible error message, to avoid a
2305                  * catalog lookup in case of an error.  If any of these return NULL,
2306                  * then the relation has been dropped since last we checked; skip it.
2307                  * Note: they must live in a long-lived memory context because we call
2308                  * vacuum and analyze in different transactions.
2309                  */
2310
2311                 tab->at_relname = get_rel_name(tab->at_relid);
2312                 tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2313                 tab->at_datname = get_database_name(MyDatabaseId);
2314                 if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2315                         goto deleted;
2316
2317                 /*
2318                  * We will abort vacuuming the current table if something errors out,
2319                  * and continue with the next one in schedule; in particular, this
2320                  * happens if we are interrupted with SIGINT.
2321                  */
2322                 PG_TRY();
2323                 {
2324                         /* have at it */
2325                         MemoryContextSwitchTo(TopTransactionContext);
2326                         autovacuum_do_vac_analyze(tab, bstrategy);
2327
2328                         /*
2329                          * Clear a possible query-cancel signal, to avoid a late reaction
2330                          * to an automatically-sent signal because of vacuuming the
2331                          * current table (we're done with it, so it would make no sense to
2332                          * cancel at this point.)
2333                          */
2334                         QueryCancelPending = false;
2335                 }
2336                 PG_CATCH();
2337                 {
2338                         /*
2339                          * Abort the transaction, start a new one, and proceed with the
2340                          * next table in our list.
2341                          */
2342                         HOLD_INTERRUPTS();
2343                         if (tab->at_dovacuum)
2344                                 errcontext("automatic vacuum of table \"%s.%s.%s\"",
2345                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2346                         else
2347                                 errcontext("automatic analyze of table \"%s.%s.%s\"",
2348                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2349                         EmitErrorReport();
2350
2351                         /* this resets the PGXACT flags too */
2352                         AbortOutOfAnyTransaction();
2353                         FlushErrorState();
2354                         MemoryContextResetAndDeleteChildren(PortalContext);
2355
2356                         /* restart our transaction for the following operations */
2357                         StartTransactionCommand();
2358                         RESUME_INTERRUPTS();
2359                 }
2360                 PG_END_TRY();
2361
2362                 /* the PGXACT flags are reset at the next end of transaction */
2363
2364                 /* be tidy */
2365 deleted:
2366                 if (tab->at_datname != NULL)
2367                         pfree(tab->at_datname);
2368                 if (tab->at_nspname != NULL)
2369                         pfree(tab->at_nspname);
2370                 if (tab->at_relname != NULL)
2371                         pfree(tab->at_relname);
2372                 pfree(tab);
2373
2374                 /*
2375                  * Remove my info from shared memory.  We could, but intentionally
2376                  * don't, clear wi_cost_limit and friends --- this is on the
2377                  * assumption that we probably have more to do with similar cost
2378                  * settings, so we don't want to give up our share of I/O for a very
2379                  * short interval and thereby thrash the global balance.
2380                  */
2381                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2382                 MyWorkerInfo->wi_tableoid = InvalidOid;
2383                 LWLockRelease(AutovacuumLock);
2384
2385                 /* restore vacuum cost GUCs for the next iteration */
2386                 VacuumCostDelay = stdVacuumCostDelay;
2387                 VacuumCostLimit = stdVacuumCostLimit;
2388         }
2389
2390         /*
2391          * We leak table_toast_map here (among other things), but since we're
2392          * going away soon, it's not a problem.
2393          */
2394
2395         /*
2396          * Update pg_database.datfrozenxid, and truncate pg_clog if possible. We
2397          * only need to do this once, not after each table.
2398          */
2399         vac_update_datfrozenxid();
2400
2401         /* Finally close out the last transaction. */
2402         CommitTransactionCommand();
2403 }
2404
2405 /*
2406  * extract_autovac_opts
2407  *
2408  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2409  * reloptions, if set; otherwise, return NULL.
2410  */
2411 static AutoVacOpts *
2412 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2413 {
2414         bytea      *relopts;
2415         AutoVacOpts *av;
2416
2417         Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2418                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2419                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2420
2421         relopts = extractRelOptions(tup, pg_class_desc, InvalidOid);
2422         if (relopts == NULL)
2423                 return NULL;
2424
2425         av = palloc(sizeof(AutoVacOpts));
2426         memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2427         pfree(relopts);
2428
2429         return av;
2430 }
2431
2432 /*
2433  * get_pgstat_tabentry_relid
2434  *
2435  * Fetch the pgstat entry of a table, either local to a database or shared.
2436  */
2437 static PgStat_StatTabEntry *
2438 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2439                                                   PgStat_StatDBEntry *dbentry)
2440 {
2441         PgStat_StatTabEntry *tabentry = NULL;
2442
2443         if (isshared)
2444         {
2445                 if (PointerIsValid(shared))
2446                         tabentry = hash_search(shared->tables, &relid,
2447                                                                    HASH_FIND, NULL);
2448         }
2449         else if (PointerIsValid(dbentry))
2450                 tabentry = hash_search(dbentry->tables, &relid,
2451                                                            HASH_FIND, NULL);
2452
2453         return tabentry;
2454 }
2455
2456 /*
2457  * table_recheck_autovac
2458  *
2459  * Recheck whether a table still needs vacuum or analyze.  Return value is a
2460  * valid autovac_table pointer if it does, NULL otherwise.
2461  *
2462  * Note that the returned autovac_table does not have the name fields set.
2463  */
2464 static autovac_table *
2465 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2466                                           TupleDesc pg_class_desc)
2467 {
2468         Form_pg_class classForm;
2469         HeapTuple       classTup;
2470         bool            dovacuum;
2471         bool            doanalyze;
2472         autovac_table *tab = NULL;
2473         PgStat_StatTabEntry *tabentry;
2474         PgStat_StatDBEntry *shared;
2475         PgStat_StatDBEntry *dbentry;
2476         bool            wraparound;
2477         AutoVacOpts *avopts;
2478
2479         /* use fresh stats */
2480         autovac_refresh_stats();
2481
2482         shared = pgstat_fetch_stat_dbentry(InvalidOid);
2483         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2484
2485         /* fetch the relation's relcache entry */
2486         classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2487         if (!HeapTupleIsValid(classTup))
2488                 return NULL;
2489         classForm = (Form_pg_class) GETSTRUCT(classTup);
2490
2491         /*
2492          * Get the applicable reloptions.  If it is a TOAST table, try to get the
2493          * main table reloptions if the toast table itself doesn't have.
2494          */
2495         avopts = extract_autovac_opts(classTup, pg_class_desc);
2496         if (classForm->relkind == RELKIND_TOASTVALUE &&
2497                 avopts == NULL && table_toast_map != NULL)
2498         {
2499                 av_relation *hentry;
2500                 bool            found;
2501
2502                 hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2503                 if (found && hentry->ar_hasrelopts)
2504                         avopts = &hentry->ar_reloptions;
2505         }
2506
2507         /* fetch the pgstat table entry */
2508         tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2509                                                                                  shared, dbentry);
2510
2511         relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2512                                                           &dovacuum, &doanalyze, &wraparound);
2513
2514         /* ignore ANALYZE for toast tables */
2515         if (classForm->relkind == RELKIND_TOASTVALUE)
2516                 doanalyze = false;
2517
2518         /* OK, it needs something done */
2519         if (doanalyze || dovacuum)
2520         {
2521                 int                     freeze_min_age;
2522                 int                     freeze_table_age;
2523                 int                     multixact_freeze_min_age;
2524                 int                     multixact_freeze_table_age;
2525                 int                     vac_cost_limit;
2526                 int                     vac_cost_delay;
2527
2528                 /*
2529                  * Calculate the vacuum cost parameters and the freeze ages.  If there
2530                  * are options set in pg_class.reloptions, use them; in the case of a
2531                  * toast table, try the main table too.  Otherwise use the GUC
2532                  * defaults, autovacuum's own first and plain vacuum second.
2533                  */
2534
2535                 /* -1 in autovac setting means use plain vacuum_cost_delay */
2536                 vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2537                         ? avopts->vacuum_cost_delay
2538                         : (autovacuum_vac_cost_delay >= 0)
2539                         ? autovacuum_vac_cost_delay
2540                         : VacuumCostDelay;
2541
2542                 /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2543                 vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2544                         ? avopts->vacuum_cost_limit
2545                         : (autovacuum_vac_cost_limit > 0)
2546                         ? autovacuum_vac_cost_limit
2547                         : VacuumCostLimit;
2548
2549                 /* these do not have autovacuum-specific settings */
2550                 freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2551                         ? avopts->freeze_min_age
2552                         : default_freeze_min_age;
2553
2554                 freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2555                         ? avopts->freeze_table_age
2556                         : default_freeze_table_age;
2557
2558                 multixact_freeze_min_age = (avopts &&
2559                                                                         avopts->multixact_freeze_min_age >= 0)
2560                         ? avopts->multixact_freeze_min_age
2561                         : default_multixact_freeze_min_age;
2562
2563                 multixact_freeze_table_age = (avopts &&
2564                                                                           avopts->multixact_freeze_table_age >= 0)
2565                         ? avopts->multixact_freeze_table_age
2566                         : default_multixact_freeze_table_age;
2567
2568                 tab = palloc(sizeof(autovac_table));
2569                 tab->at_relid = relid;
2570                 tab->at_dovacuum = dovacuum;
2571                 tab->at_doanalyze = doanalyze;
2572                 tab->at_freeze_min_age = freeze_min_age;
2573                 tab->at_freeze_table_age = freeze_table_age;
2574                 tab->at_multixact_freeze_min_age = multixact_freeze_min_age;
2575                 tab->at_multixact_freeze_table_age = multixact_freeze_table_age;
2576                 tab->at_vacuum_cost_limit = vac_cost_limit;
2577                 tab->at_vacuum_cost_delay = vac_cost_delay;
2578                 tab->at_wraparound = wraparound;
2579                 tab->at_relname = NULL;
2580                 tab->at_nspname = NULL;
2581                 tab->at_datname = NULL;
2582         }
2583
2584         heap_freetuple(classTup);
2585
2586         return tab;
2587 }
2588
2589 /*
2590  * relation_needs_vacanalyze
2591  *
2592  * Check whether a relation needs to be vacuumed or analyzed; return each into
2593  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2594  * being forced because of Xid or multixact wraparound.
2595  *
2596  * relopts is a pointer to the AutoVacOpts options (either for itself in the
2597  * case of a plain table, or for either itself or its parent table in the case
2598  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2599  * NULL.
2600  *
2601  * A table needs to be vacuumed if the number of dead tuples exceeds a
2602  * threshold.  This threshold is calculated as
2603  *
2604  * threshold = vac_base_thresh + vac_scale_factor * reltuples
2605  *
2606  * For analyze, the analysis done is that the number of tuples inserted,
2607  * deleted and updated since the last analyze exceeds a threshold calculated
2608  * in the same fashion as above.  Note that the collector actually stores
2609  * the number of tuples (both live and dead) that there were as of the last
2610  * analyze.  This is asymmetric to the VACUUM case.
2611  *
2612  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2613  * transactions back, and if its relminmxid is more than
2614  * multixact_freeze_max_age multixacts back.
2615  *
2616  * A table whose autovacuum_enabled option is false is
2617  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2618  * Thus autovacuum can be disabled for specific tables. Also, when the stats
2619  * collector does not have data about a table, it will be skipped.
2620  *
2621  * A table whose vac_base_thresh value is < 0 takes the base value from the
2622  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2623  * value < 0 is substituted with the value of
2624  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2625  */
2626 static void
2627 relation_needs_vacanalyze(Oid relid,
2628                                                   AutoVacOpts *relopts,
2629                                                   Form_pg_class classForm,
2630                                                   PgStat_StatTabEntry *tabentry,
2631  /* output params below */
2632                                                   bool *dovacuum,
2633                                                   bool *doanalyze,
2634                                                   bool *wraparound)
2635 {
2636         bool            force_vacuum;
2637         bool            av_enabled;
2638         float4          reltuples;              /* pg_class.reltuples */
2639
2640         /* constants from reloptions or GUC variables */
2641         int                     vac_base_thresh,
2642                                 anl_base_thresh;
2643         float4          vac_scale_factor,
2644                                 anl_scale_factor;
2645
2646         /* thresholds calculated from above constants */
2647         float4          vacthresh,
2648                                 anlthresh;
2649
2650         /* number of vacuum (resp. analyze) tuples at this time */
2651         float4          vactuples,
2652                                 anltuples;
2653
2654         /* freeze parameters */
2655         int                     freeze_max_age;
2656         int                     multixact_freeze_max_age;
2657         TransactionId xidForceLimit;
2658         MultiXactId multiForceLimit;
2659
2660         AssertArg(classForm != NULL);
2661         AssertArg(OidIsValid(relid));
2662
2663         /*
2664          * Determine vacuum/analyze equation parameters.  We have two possible
2665          * sources: the passed reloptions (which could be a main table or a toast
2666          * table), or the autovacuum GUC variables.
2667          */
2668
2669         /* -1 in autovac setting means use plain vacuum_cost_delay */
2670         vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
2671                 ? relopts->vacuum_scale_factor
2672                 : autovacuum_vac_scale;
2673
2674         vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
2675                 ? relopts->vacuum_threshold
2676                 : autovacuum_vac_thresh;
2677
2678         anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
2679                 ? relopts->analyze_scale_factor
2680                 : autovacuum_anl_scale;
2681
2682         anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
2683                 ? relopts->analyze_threshold
2684                 : autovacuum_anl_thresh;
2685
2686         freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
2687                 ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
2688                 : autovacuum_freeze_max_age;
2689
2690         multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0)
2691                 ? Min(relopts->multixact_freeze_max_age, autovacuum_multixact_freeze_max_age)
2692                 : autovacuum_multixact_freeze_max_age;
2693
2694         av_enabled = (relopts ? relopts->enabled : true);
2695
2696         /* Force vacuum if table is at risk of wraparound */
2697         xidForceLimit = recentXid - freeze_max_age;
2698         if (xidForceLimit < FirstNormalTransactionId)
2699                 xidForceLimit -= FirstNormalTransactionId;
2700         force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
2701                                         TransactionIdPrecedes(classForm->relfrozenxid,
2702                                                                                   xidForceLimit));
2703         if (!force_vacuum)
2704         {
2705                 multiForceLimit = recentMulti - multixact_freeze_max_age;
2706                 if (multiForceLimit < FirstMultiXactId)
2707                         multiForceLimit -= FirstMultiXactId;
2708                 force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
2709                                                                                    multiForceLimit);
2710         }
2711         *wraparound = force_vacuum;
2712
2713         /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
2714         if (!force_vacuum && !av_enabled)
2715         {
2716                 *doanalyze = false;
2717                 *dovacuum = false;
2718                 return;
2719         }
2720
2721         if (PointerIsValid(tabentry))
2722         {
2723                 reltuples = classForm->reltuples;
2724                 vactuples = tabentry->n_dead_tuples;
2725                 anltuples = tabentry->changes_since_analyze;
2726
2727                 vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
2728                 anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
2729
2730                 /*
2731                  * Note that we don't need to take special consideration for stat
2732                  * reset, because if that happens, the last vacuum and analyze counts
2733                  * will be reset too.
2734                  */
2735                 elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
2736                          NameStr(classForm->relname),
2737                          vactuples, vacthresh, anltuples, anlthresh);
2738
2739                 /* Determine if this table needs vacuum or analyze. */
2740                 *dovacuum = force_vacuum || (vactuples > vacthresh);
2741                 *doanalyze = (anltuples > anlthresh);
2742         }
2743         else
2744         {
2745                 /*
2746                  * Skip a table not found in stat hash, unless we have to force vacuum
2747                  * for anti-wrap purposes.      If it's not acted upon, there's no need to
2748                  * vacuum it.
2749                  */
2750                 *dovacuum = force_vacuum;
2751                 *doanalyze = false;
2752         }
2753
2754         /* ANALYZE refuses to work with pg_statistics */
2755         if (relid == StatisticRelationId)
2756                 *doanalyze = false;
2757 }
2758
2759 /*
2760  * autovacuum_do_vac_analyze
2761  *              Vacuum and/or analyze the specified table
2762  */
2763 static void
2764 autovacuum_do_vac_analyze(autovac_table *tab,
2765                                                   BufferAccessStrategy bstrategy)
2766 {
2767         VacuumStmt      vacstmt;
2768         RangeVar        rangevar;
2769
2770         /* Set up command parameters --- use local variables instead of palloc */
2771         MemSet(&vacstmt, 0, sizeof(vacstmt));
2772         MemSet(&rangevar, 0, sizeof(rangevar));
2773
2774         rangevar.schemaname = tab->at_nspname;
2775         rangevar.relname = tab->at_relname;
2776         rangevar.location = -1;
2777
2778         vacstmt.type = T_VacuumStmt;
2779         if (!tab->at_wraparound)
2780                 vacstmt.options = VACOPT_NOWAIT;
2781         if (tab->at_dovacuum)
2782                 vacstmt.options |= VACOPT_VACUUM;
2783         if (tab->at_doanalyze)
2784                 vacstmt.options |= VACOPT_ANALYZE;
2785         vacstmt.freeze_min_age = tab->at_freeze_min_age;
2786         vacstmt.freeze_table_age = tab->at_freeze_table_age;
2787         vacstmt.multixact_freeze_min_age = tab->at_multixact_freeze_min_age;
2788         vacstmt.multixact_freeze_table_age = tab->at_multixact_freeze_table_age;
2789         /* we pass the OID, but might need this anyway for an error message */
2790         vacstmt.relation = &rangevar;
2791         vacstmt.va_cols = NIL;
2792
2793         /* Let pgstat know what we're doing */
2794         autovac_report_activity(tab);
2795
2796         vacuum(&vacstmt, tab->at_relid, false, bstrategy, tab->at_wraparound, true);
2797 }
2798
2799 /*
2800  * autovac_report_activity
2801  *              Report to pgstat what autovacuum is doing
2802  *
2803  * We send a SQL string corresponding to what the user would see if the
2804  * equivalent command was to be issued manually.
2805  *
2806  * Note we assume that we are going to report the next command as soon as we're
2807  * done with the current one, and exit right after the last one, so we don't
2808  * bother to report "<IDLE>" or some such.
2809  */
2810 static void
2811 autovac_report_activity(autovac_table *tab)
2812 {
2813 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
2814         char            activity[MAX_AUTOVAC_ACTIV_LEN];
2815         int                     len;
2816
2817         /* Report the command and possible options */
2818         if (tab->at_dovacuum)
2819                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2820                                  "autovacuum: VACUUM%s",
2821                                  tab->at_doanalyze ? " ANALYZE" : "");
2822         else
2823                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2824                                  "autovacuum: ANALYZE");
2825
2826         /*
2827          * Report the qualified name of the relation.
2828          */
2829         len = strlen(activity);
2830
2831         snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
2832                          " %s.%s%s", tab->at_nspname, tab->at_relname,
2833                          tab->at_wraparound ? " (to prevent wraparound)" : "");
2834
2835         /* Set statement_timestamp() to current time for pg_stat_activity */
2836         SetCurrentStatementStartTimestamp();
2837
2838         pgstat_report_activity(STATE_RUNNING, activity);
2839 }
2840
2841 /*
2842  * AutoVacuumingActive
2843  *              Check GUC vars and report whether the autovacuum process should be
2844  *              running.
2845  */
2846 bool
2847 AutoVacuumingActive(void)
2848 {
2849         if (!autovacuum_start_daemon || !pgstat_track_counts)
2850                 return false;
2851         return true;
2852 }
2853
2854 /*
2855  * autovac_init
2856  *              This is called at postmaster initialization.
2857  *
2858  * All we do here is annoy the user if he got it wrong.
2859  */
2860 void
2861 autovac_init(void)
2862 {
2863         if (autovacuum_start_daemon && !pgstat_track_counts)
2864                 ereport(WARNING,
2865                                 (errmsg("autovacuum not started because of misconfiguration"),
2866                                  errhint("Enable the \"track_counts\" option.")));
2867 }
2868
2869 /*
2870  * IsAutoVacuum functions
2871  *              Return whether this is either a launcher autovacuum process or a worker
2872  *              process.
2873  */
2874 bool
2875 IsAutoVacuumLauncherProcess(void)
2876 {
2877         return am_autovacuum_launcher;
2878 }
2879
2880 bool
2881 IsAutoVacuumWorkerProcess(void)
2882 {
2883         return am_autovacuum_worker;
2884 }
2885
2886
2887 /*
2888  * AutoVacuumShmemSize
2889  *              Compute space needed for autovacuum-related shared memory
2890  */
2891 Size
2892 AutoVacuumShmemSize(void)
2893 {
2894         Size            size;
2895
2896         /*
2897          * Need the fixed struct and the array of WorkerInfoData.
2898          */
2899         size = sizeof(AutoVacuumShmemStruct);
2900         size = MAXALIGN(size);
2901         size = add_size(size, mul_size(autovacuum_max_workers,
2902                                                                    sizeof(WorkerInfoData)));
2903         return size;
2904 }
2905
2906 /*
2907  * AutoVacuumShmemInit
2908  *              Allocate and initialize autovacuum-related shared memory
2909  */
2910 void
2911 AutoVacuumShmemInit(void)
2912 {
2913         bool            found;
2914
2915         AutoVacuumShmem = (AutoVacuumShmemStruct *)
2916                 ShmemInitStruct("AutoVacuum Data",
2917                                                 AutoVacuumShmemSize(),
2918                                                 &found);
2919
2920         if (!IsUnderPostmaster)
2921         {
2922                 WorkerInfo      worker;
2923                 int                     i;
2924
2925                 Assert(!found);
2926
2927                 AutoVacuumShmem->av_launcherpid = 0;
2928                 dlist_init(&AutoVacuumShmem->av_freeWorkers);
2929                 dlist_init(&AutoVacuumShmem->av_runningWorkers);
2930                 AutoVacuumShmem->av_startingWorker = NULL;
2931
2932                 worker = (WorkerInfo) ((char *) AutoVacuumShmem +
2933                                                            MAXALIGN(sizeof(AutoVacuumShmemStruct)));
2934
2935                 /* initialize the WorkerInfo free list */
2936                 for (i = 0; i < autovacuum_max_workers; i++)
2937                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
2938                                                         &worker[i].wi_links);
2939         }
2940         else
2941                 Assert(found);
2942 }
2943
2944 /*
2945  * autovac_refresh_stats
2946  *              Refresh pgstats data for an autovacuum process
2947  *
2948  * Cause the next pgstats read operation to obtain fresh data, but throttle
2949  * such refreshing in the autovacuum launcher.  This is mostly to avoid
2950  * rereading the pgstats files too many times in quick succession when there
2951  * are many databases.
2952  *
2953  * Note: we avoid throttling in the autovac worker, as it would be
2954  * counterproductive in the recheck logic.
2955  */
2956 static void
2957 autovac_refresh_stats(void)
2958 {
2959         if (IsAutoVacuumLauncherProcess())
2960         {
2961                 static TimestampTz last_read = 0;
2962                 TimestampTz current_time;
2963
2964                 current_time = GetCurrentTimestamp();
2965
2966                 if (!TimestampDifferenceExceeds(last_read, current_time,
2967                                                                                 STATS_READ_DELAY))
2968                         return;
2969
2970                 last_read = current_time;
2971         }
2972
2973         pgstat_clear_snapshot();
2974 }