]> granicus.if.org Git - postgresql/blob - src/backend/postmaster/autovacuum.c
4fa66222d9d52833b9638d14e1f176f8583ddb5f
[postgresql] / src / backend / postmaster / autovacuum.c
1 /*-------------------------------------------------------------------------
2  *
3  * autovacuum.c
4  *
5  * PostgreSQL Integrated Autovacuum Daemon
6  *
7  * The autovacuum system is structured in two different kinds of processes: the
8  * autovacuum launcher and the autovacuum worker.  The launcher is an
9  * always-running process, started by the postmaster when the autovacuum GUC
10  * parameter is set.  The launcher schedules autovacuum workers to be started
11  * when appropriate.  The workers are the processes which execute the actual
12  * vacuuming; they connect to a database as determined in the launcher, and
13  * once connected they examine the catalogs to select the tables to vacuum.
14  *
15  * The autovacuum launcher cannot start the worker processes by itself,
16  * because doing so would cause robustness issues (namely, failure to shut
17  * them down on exceptional conditions, and also, since the launcher is
18  * connected to shared memory and is thus subject to corruption there, it is
19  * not as robust as the postmaster).  So it leaves that task to the postmaster.
20  *
21  * There is an autovacuum shared memory area, where the launcher stores
22  * information about the database it wants vacuumed.  When it wants a new
23  * worker to start, it sets a flag in shared memory and sends a signal to the
24  * postmaster.  The postmaster then knows only that it must start a worker;
25  * so it forks a new child, which turns into a worker.  This new process
26  * connects to shared memory, and there it can inspect the information that the
27  * launcher has set up.
28  *
29  * If the fork() call fails in the postmaster, it sets a flag in the shared
30  * memory area, and sends a signal to the launcher.  The launcher, upon
31  * noticing the flag, can try starting the worker again by resending the
32  * signal.  Note that the failure can only be transient (fork failure due to
33  * high load, memory pressure, too many processes, etc); more permanent
34  * problems, like failure to connect to a database, are detected later in the
35  * worker and dealt with just by having the worker exit normally.  The launcher
36  * will launch a new worker again later, per schedule.
37  *
38  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
39  * launcher then wakes up and is able to launch another worker, if the schedule
40  * is so tight that a new worker is needed immediately.  At this time the
41  * launcher can also balance the settings for the various remaining workers'
42  * cost-based vacuum delay feature.
43  *
44  * Note that there can be more than one worker in a database concurrently.
45  * They will store the table they are currently vacuuming in shared memory, so
46  * that other workers avoid being blocked waiting for the vacuum lock for that
47  * table.  They will also reload the pgstats data just before vacuuming each
48  * table, to avoid vacuuming a table that was just finished being vacuumed by
49  * another worker and thus is no longer noted in shared memory.  However,
50  * there is a window (caused by pgstat delay) during which a worker may choose a
51  * table that was already vacuumed; this is a bug in the current design.
52  *
53  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
54  * Portions Copyright (c) 1994, Regents of the University of California
55  *
56  *
57  * IDENTIFICATION
58  *        src/backend/postmaster/autovacuum.c
59  *
60  *-------------------------------------------------------------------------
61  */
62 #include "postgres.h"
63
64 #include <signal.h>
65 #include <sys/types.h>
66 #include <sys/time.h>
67 #include <unistd.h>
68
69 #include "access/heapam.h"
70 #include "access/htup_details.h"
71 #include "access/multixact.h"
72 #include "access/reloptions.h"
73 #include "access/transam.h"
74 #include "access/xact.h"
75 #include "catalog/dependency.h"
76 #include "catalog/namespace.h"
77 #include "catalog/pg_database.h"
78 #include "commands/dbcommands.h"
79 #include "commands/vacuum.h"
80 #include "lib/ilist.h"
81 #include "libpq/pqsignal.h"
82 #include "miscadmin.h"
83 #include "pgstat.h"
84 #include "postmaster/autovacuum.h"
85 #include "postmaster/fork_process.h"
86 #include "postmaster/postmaster.h"
87 #include "storage/bufmgr.h"
88 #include "storage/ipc.h"
89 #include "storage/latch.h"
90 #include "storage/pmsignal.h"
91 #include "storage/proc.h"
92 #include "storage/procsignal.h"
93 #include "storage/sinvaladt.h"
94 #include "tcop/tcopprot.h"
95 #include "utils/fmgroids.h"
96 #include "utils/lsyscache.h"
97 #include "utils/memutils.h"
98 #include "utils/ps_status.h"
99 #include "utils/rel.h"
100 #include "utils/snapmgr.h"
101 #include "utils/syscache.h"
102 #include "utils/timeout.h"
103 #include "utils/timestamp.h"
104 #include "utils/tqual.h"
105
106
/*
 * Configuration (GUC) variables.
 *
 * Only autovacuum_start_daemon, autovacuum_work_mem and
 * Log_autovacuum_min_duration carry an explicit initializer; everything
 * else starts out with the usual zero static initialization.
 */

/* daemon control and sizing */
bool		autovacuum_start_daemon = false;
int			autovacuum_max_workers;	/* sizes the shared WorkerInfo array */
int			autovacuum_work_mem = -1;
int			autovacuum_naptime;		/* used as a number of seconds */

/* vacuum / analyze thresholds and scale factors */
int			autovacuum_vac_thresh;
double		autovacuum_vac_scale;
int			autovacuum_anl_thresh;
double		autovacuum_anl_scale;

/* freeze age limits */
int			autovacuum_freeze_max_age;
int			autovacuum_multixact_freeze_max_age;

/* cost-based delay settings */
int			autovacuum_vac_cost_delay;
int			autovacuum_vac_cost_limit;

/* logging */
int			Log_autovacuum_min_duration = -1;
125
/* lifetime of pgstat data kept by the launcher, in milliseconds */
#define STATS_READ_DELAY 1000

/*
 * Limits on the interval between two launcher wakeups.  Note the differing
 * units: the lower limit is in milliseconds, the upper one in seconds.
 */
#define MIN_AUTOVAC_SLEEPTIME 100.0		/* milliseconds */
#define MAX_AUTOVAC_SLEEPTIME 300		/* seconds */
132
/* Which kind of autovacuum process, if any, this process is */
static bool am_autovacuum_launcher = false;
static bool am_autovacuum_worker = false;

/*
 * Set asynchronously from signal handlers and polled by the main code,
 * hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t got_SIGUSR2 = false;
static volatile sig_atomic_t got_SIGTERM = false;
141
142 /* Comparison points for determining whether freeze_max_age is exceeded */
143 static TransactionId recentXid;
144 static MultiXactId recentMulti;
145
146 /* Default freeze ages to use for autovacuum (varies by database) */
147 static int      default_freeze_min_age;
148 static int      default_freeze_table_age;
149 static int      default_multixact_freeze_min_age;
150 static int      default_multixact_freeze_table_age;
151
152 /* Memory context for long-lived data */
153 static MemoryContext AutovacMemCxt;
154
155 /* struct to keep track of databases in launcher */
156 typedef struct avl_dbase
157 {
158         Oid                     adl_datid;              /* hash key -- must be first */
159         TimestampTz adl_next_worker;
160         int                     adl_score;
161         dlist_node      adl_node;
162 } avl_dbase;
163
164 /* struct to keep track of databases in worker */
165 typedef struct avw_dbase
166 {
167         Oid                     adw_datid;
168         char       *adw_name;
169         TransactionId adw_frozenxid;
170         MultiXactId adw_minmulti;
171         PgStat_StatDBEntry *adw_entry;
172 } avw_dbase;
173
174 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
175 typedef struct av_relation
176 {
177         Oid                     ar_toastrelid;  /* hash key - must be first */
178         Oid                     ar_relid;
179         bool            ar_hasrelopts;
180         AutoVacOpts ar_reloptions;      /* copy of AutoVacOpts from the main table's
181                                                                  * reloptions, or NULL if none */
182 } av_relation;
183
184 /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
185 typedef struct autovac_table
186 {
187         Oid                     at_relid;
188         int                     at_vacoptions;  /* bitmask of VacuumOption */
189         VacuumParams at_params;
190         int                     at_vacuum_cost_delay;
191         int                     at_vacuum_cost_limit;
192         bool            at_dobalance;
193         char       *at_relname;
194         char       *at_nspname;
195         char       *at_datname;
196 } autovac_table;
197
198 /*-------------
199  * This struct holds information about a single worker's whereabouts.  We keep
200  * an array of these in shared memory, sized according to
201  * autovacuum_max_workers.
202  *
203  * wi_links             entry into free list or running list
204  * wi_dboid             OID of the database this worker is supposed to work on
205  * wi_tableoid  OID of the table currently being vacuumed, if any
206  * wi_proc              pointer to PGPROC of the running worker, NULL if not started
207  * wi_launchtime Time at which this worker was launched
208  * wi_cost_*    Vacuum cost-based delay parameters current in this worker
209  *
210  * All fields are protected by AutovacuumLock, except for wi_tableoid which is
211  * protected by AutovacuumScheduleLock (which is read-only for everyone except
212  * that worker itself).
213  *-------------
214  */
215 typedef struct WorkerInfoData
216 {
217         dlist_node      wi_links;
218         Oid                     wi_dboid;
219         Oid                     wi_tableoid;
220         PGPROC     *wi_proc;
221         TimestampTz wi_launchtime;
222         bool            wi_dobalance;
223         int                     wi_cost_delay;
224         int                     wi_cost_limit;
225         int                     wi_cost_limit_base;
226 } WorkerInfoData;
227
228 typedef struct WorkerInfoData *WorkerInfo;
229
/*
 * Conditions that other processes can report to the launcher.  Each one has
 * a slot in shared memory that is written atomically, so that the reporting
 * process needs no lock.
 */
typedef enum
{
	AutoVacForkFailed = 0,		/* failed trying to start a worker */
	AutoVacRebalance = 1,		/* rebalance the cost limits */
	AutoVacNumSignals = 2		/* number of signals; must be last */
}	AutoVacuumSignal;
241
242 /*-------------
243  * The main autovacuum shmem struct.  On shared memory we store this main
244  * struct and the array of WorkerInfo structs.  This struct keeps:
245  *
246  * av_signal            set by other processes to indicate various conditions
247  * av_launcherpid       the PID of the autovacuum launcher
248  * av_freeWorkers       the WorkerInfo freelist
249  * av_runningWorkers the WorkerInfo non-free queue
250  * av_startingWorker pointer to WorkerInfo currently being started (cleared by
251  *                                      the worker itself as soon as it's up and running)
252  *
253  * This struct is protected by AutovacuumLock, except for av_signal and parts
254  * of the worker list (see above).
255  *-------------
256  */
257 typedef struct
258 {
259         sig_atomic_t av_signal[AutoVacNumSignals];
260         pid_t           av_launcherpid;
261         dlist_head      av_freeWorkers;
262         dlist_head      av_runningWorkers;
263         WorkerInfo      av_startingWorker;
264 } AutoVacuumShmemStruct;
265
266 static AutoVacuumShmemStruct *AutoVacuumShmem;
267
268 /*
269  * the database list (of avl_dbase elements) in the launcher, and the context
270  * that contains it
271  */
272 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
273 static MemoryContext DatabaseListCxt = NULL;
274
275 /* Pointer to my own WorkerInfo, valid on each worker */
276 static WorkerInfo MyWorkerInfo = NULL;
277
278 /* PID of launcher, valid only in worker while shutting down */
279 int                     AutovacuumLauncherPid = 0;
280
281 #ifdef EXEC_BACKEND
282 static pid_t avlauncher_forkexec(void);
283 static pid_t avworker_forkexec(void);
284 #endif
285 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn();
286 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn();
287
288 static Oid      do_start_worker(void);
289 static void launcher_determine_sleep(bool canlaunch, bool recursing,
290                                                  struct timeval * nap);
291 static void launch_worker(TimestampTz now);
292 static List *get_database_list(void);
293 static void rebuild_database_list(Oid newdb);
294 static int      db_comparator(const void *a, const void *b);
295 static void autovac_balance_cost(void);
296
297 static void do_autovacuum(void);
298 static void FreeWorkerInfo(int code, Datum arg);
299
300 static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
301                                           TupleDesc pg_class_desc,
302                                           int effective_multixact_freeze_max_age);
303 static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
304                                                   Form_pg_class classForm,
305                                                   PgStat_StatTabEntry *tabentry,
306                                                   int effective_multixact_freeze_max_age,
307                                                   bool *dovacuum, bool *doanalyze, bool *wraparound);
308
309 static void autovacuum_do_vac_analyze(autovac_table *tab,
310                                                   BufferAccessStrategy bstrategy);
311 static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
312                                          TupleDesc pg_class_desc);
313 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
314                                                   PgStat_StatDBEntry *shared,
315                                                   PgStat_StatDBEntry *dbentry);
316 static void autovac_report_activity(autovac_table *tab);
317 static void av_sighup_handler(SIGNAL_ARGS);
318 static void avl_sigusr2_handler(SIGNAL_ARGS);
319 static void avl_sigterm_handler(SIGNAL_ARGS);
320 static void autovac_refresh_stats(void);
321
322
323
324 /********************************************************************
325  *                                        AUTOVACUUM LAUNCHER CODE
326  ********************************************************************/
327
328 #ifdef EXEC_BACKEND
/*
 * avlauncher_forkexec
 *		EXEC_BACKEND path for starting the autovacuum launcher.
 *
 * Builds the argument vector for the launcher and hands it to
 * postmaster_forkexec, whose result is returned unchanged.
 */
static pid_t
avlauncher_forkexec(void)
{
	char	   *argv_buf[10];
	int			nargs;

	nargs = 0;

	argv_buf[nargs] = "postgres";
	nargs++;
	argv_buf[nargs] = "--forkavlauncher";
	nargs++;

	/* placeholder slot; postmaster_forkexec fills it in */
	argv_buf[nargs] = NULL;
	nargs++;

	/* terminate the vector; the terminator is not counted in nargs */
	argv_buf[nargs] = NULL;

	Assert(nargs < lengthof(argv_buf));

	return postmaster_forkexec(nargs, argv_buf);
}
349
350 /*
351  * We need this set from the outside, before InitProcess is called
352  */
353 void
354 AutovacuumLauncherIAm(void)
355 {
356         am_autovacuum_launcher = true;
357 }
358 #endif
359
360 /*
361  * Main entry point for autovacuum launcher process, to be called from the
362  * postmaster.
363  */
364 int
365 StartAutoVacLauncher(void)
366 {
367         pid_t           AutoVacPID;
368
369 #ifdef EXEC_BACKEND
370         switch ((AutoVacPID = avlauncher_forkexec()))
371 #else
372         switch ((AutoVacPID = fork_process()))
373 #endif
374         {
375                 case -1:
376                         ereport(LOG,
377                                  (errmsg("could not fork autovacuum launcher process: %m")));
378                         return 0;
379
380 #ifndef EXEC_BACKEND
381                 case 0:
382                         /* in postmaster child ... */
383                         InitPostmasterChild();
384
385                         /* Close the postmaster's sockets */
386                         ClosePostmasterPorts(false);
387
388                         AutoVacLauncherMain(0, NULL);
389                         break;
390 #endif
391                 default:
392                         return (int) AutoVacPID;
393         }
394
395         /* shouldn't get here */
396         return 0;
397 }
398
399 /*
400  * Main loop for the autovacuum launcher process.
401  */
402 NON_EXEC_STATIC void
403 AutoVacLauncherMain(int argc, char *argv[])
404 {
405         sigjmp_buf      local_sigjmp_buf;
406
407         am_autovacuum_launcher = true;
408
409         /* Identify myself via ps */
410         init_ps_display("autovacuum launcher process", "", "", "");
411
412         ereport(LOG,
413                         (errmsg("autovacuum launcher started")));
414
415         if (PostAuthDelay)
416                 pg_usleep(PostAuthDelay * 1000000L);
417
418         SetProcessingMode(InitProcessing);
419
420         /*
421          * Set up signal handlers.  We operate on databases much like a regular
422          * backend, so we use the same signal handling.  See equivalent code in
423          * tcop/postgres.c.
424          */
425         pqsignal(SIGHUP, av_sighup_handler);
426         pqsignal(SIGINT, StatementCancelHandler);
427         pqsignal(SIGTERM, avl_sigterm_handler);
428
429         pqsignal(SIGQUIT, quickdie);
430         InitializeTimeouts();           /* establishes SIGALRM handler */
431
432         pqsignal(SIGPIPE, SIG_IGN);
433         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
434         pqsignal(SIGUSR2, avl_sigusr2_handler);
435         pqsignal(SIGFPE, FloatExceptionHandler);
436         pqsignal(SIGCHLD, SIG_DFL);
437
438         /* Early initialization */
439         BaseInit();
440
441         /*
442          * Create a per-backend PGPROC struct in shared memory, except in the
443          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
444          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
445          * had to do some stuff with LWLocks).
446          */
447 #ifndef EXEC_BACKEND
448         InitProcess();
449 #endif
450
451         InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL);
452
453         SetProcessingMode(NormalProcessing);
454
455         /*
456          * Create a memory context that we will do all our work in.  We do this so
457          * that we can reset the context during error recovery and thereby avoid
458          * possible memory leaks.
459          */
460         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
461                                                                                   "Autovacuum Launcher",
462                                                                                   ALLOCSET_DEFAULT_MINSIZE,
463                                                                                   ALLOCSET_DEFAULT_INITSIZE,
464                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
465         MemoryContextSwitchTo(AutovacMemCxt);
466
467         /*
468          * If an exception is encountered, processing resumes here.
469          *
470          * This code is a stripped down version of PostgresMain error recovery.
471          */
472         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
473         {
474                 /* since not using PG_TRY, must reset error stack by hand */
475                 error_context_stack = NULL;
476
477                 /* Prevents interrupts while cleaning up */
478                 HOLD_INTERRUPTS();
479
480                 /* Forget any pending QueryCancel or timeout request */
481                 disable_all_timeouts(false);
482                 QueryCancelPending = false;             /* second to avoid race condition */
483
484                 /* Report the error to the server log */
485                 EmitErrorReport();
486
487                 /* Abort the current transaction in order to recover */
488                 AbortCurrentTransaction();
489
490                 /*
491                  * Now return to normal top-level context and clear ErrorContext for
492                  * next time.
493                  */
494                 MemoryContextSwitchTo(AutovacMemCxt);
495                 FlushErrorState();
496
497                 /* Flush any leaked data in the top-level context */
498                 MemoryContextResetAndDeleteChildren(AutovacMemCxt);
499
500                 /* don't leave dangling pointers to freed memory */
501                 DatabaseListCxt = NULL;
502                 dlist_init(&DatabaseList);
503
504                 /*
505                  * Make sure pgstat also considers our stat data as gone.  Note: we
506                  * mustn't use autovac_refresh_stats here.
507                  */
508                 pgstat_clear_snapshot();
509
510                 /* Now we can allow interrupts again */
511                 RESUME_INTERRUPTS();
512
513                 /* if in shutdown mode, no need for anything further; just go away */
514                 if (got_SIGTERM)
515                         goto shutdown;
516
517                 /*
518                  * Sleep at least 1 second after any error.  We don't want to be
519                  * filling the error logs as fast as we can.
520                  */
521                 pg_usleep(1000000L);
522         }
523
524         /* We can now handle ereport(ERROR) */
525         PG_exception_stack = &local_sigjmp_buf;
526
527         /* must unblock signals before calling rebuild_database_list */
528         PG_SETMASK(&UnBlockSig);
529
530         /*
531          * Force zero_damaged_pages OFF in the autovac process, even if it is set
532          * in postgresql.conf.  We don't really want such a dangerous option being
533          * applied non-interactively.
534          */
535         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
536
537         /*
538          * Force statement_timeout and lock_timeout to zero to avoid letting these
539          * settings prevent regular maintenance from being executed.
540          */
541         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
542         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
543
544         /*
545          * Force default_transaction_isolation to READ COMMITTED.  We don't want
546          * to pay the overhead of serializable mode, nor add any risk of causing
547          * deadlocks or delaying other transactions.
548          */
549         SetConfigOption("default_transaction_isolation", "read committed",
550                                         PGC_SUSET, PGC_S_OVERRIDE);
551
552         /*
553          * In emergency mode, just start a worker (unless shutdown was requested)
554          * and go away.
555          */
556         if (!AutoVacuumingActive())
557         {
558                 if (!got_SIGTERM)
559                         do_start_worker();
560                 proc_exit(0);                   /* done */
561         }
562
563         AutoVacuumShmem->av_launcherpid = MyProcPid;
564
565         /*
566          * Create the initial database list.  The invariant we want this list to
567          * keep is that it's ordered by decreasing next_time.  As soon as an entry
568          * is updated to a higher time, it will be moved to the front (which is
569          * correct because the only operation is to add autovacuum_naptime to the
570          * entry, and time always increases).
571          */
572         rebuild_database_list(InvalidOid);
573
574         /* loop until shutdown request */
575         while (!got_SIGTERM)
576         {
577                 struct timeval nap;
578                 TimestampTz current_time = 0;
579                 bool            can_launch;
580                 int                     rc;
581
582                 /*
583                  * This loop is a bit different from the normal use of WaitLatch,
584                  * because we'd like to sleep before the first launch of a child
585                  * process.  So it's WaitLatch, then ResetLatch, then check for
586                  * wakening conditions.
587                  */
588
589                 launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
590                                                                  false, &nap);
591
592                 /*
593                  * Wait until naptime expires or we get some type of signal (all the
594                  * signal handlers will wake us by calling SetLatch).
595                  */
596                 rc = WaitLatch(MyLatch,
597                                            WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
598                                            (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
599
600                 ResetLatch(MyLatch);
601
602                 /* Process sinval catchup interrupts that happened while sleeping */
603                 ProcessCatchupInterrupt();
604
605                 /*
606                  * Emergency bailout if postmaster has died.  This is to avoid the
607                  * necessity for manual cleanup of all postmaster children.
608                  */
609                 if (rc & WL_POSTMASTER_DEATH)
610                         proc_exit(1);
611
612                 /* the normal shutdown case */
613                 if (got_SIGTERM)
614                         break;
615
616                 if (got_SIGHUP)
617                 {
618                         got_SIGHUP = false;
619                         ProcessConfigFile(PGC_SIGHUP);
620
621                         /* shutdown requested in config file? */
622                         if (!AutoVacuumingActive())
623                                 break;
624
625                         /* rebalance in case the default cost parameters changed */
626                         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
627                         autovac_balance_cost();
628                         LWLockRelease(AutovacuumLock);
629
630                         /* rebuild the list in case the naptime changed */
631                         rebuild_database_list(InvalidOid);
632                 }
633
634                 /*
635                  * a worker finished, or postmaster signalled failure to start a
636                  * worker
637                  */
638                 if (got_SIGUSR2)
639                 {
640                         got_SIGUSR2 = false;
641
642                         /* rebalance cost limits, if needed */
643                         if (AutoVacuumShmem->av_signal[AutoVacRebalance])
644                         {
645                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
646                                 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
647                                 autovac_balance_cost();
648                                 LWLockRelease(AutovacuumLock);
649                         }
650
651                         if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
652                         {
653                                 /*
654                                  * If the postmaster failed to start a new worker, we sleep
655                                  * for a little while and resend the signal.  The new worker's
656                                  * state is still in memory, so this is sufficient.  After
657                                  * that, we restart the main loop.
658                                  *
659                                  * XXX should we put a limit to the number of times we retry?
660                                  * I don't think it makes much sense, because a future start
661                                  * of a worker will continue to fail in the same way.
662                                  */
663                                 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
664                                 pg_usleep(1000000L);    /* 1s */
665                                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
666                                 continue;
667                         }
668                 }
669
670                 /*
671                  * There are some conditions that we need to check before trying to
672                  * start a worker.  First, we need to make sure that there is a
673                  * worker slot available.  Second, we need to make sure that no
674                  * other worker failed while starting up.
675                  */
676
677                 current_time = GetCurrentTimestamp();
678                 LWLockAcquire(AutovacuumLock, LW_SHARED);
679
680                 can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
681
682                 if (AutoVacuumShmem->av_startingWorker != NULL)
683                 {
684                         int                     waittime;
685                         WorkerInfo      worker = AutoVacuumShmem->av_startingWorker;
686
687                         /*
688                          * We can't launch another worker when another one is still
689                          * starting up (or failed while doing so), so just sleep for a bit
690                          * more; that worker will wake us up again as soon as it's ready.
691                          * We will only wait autovacuum_naptime seconds (up to a maximum
692                          * of 60 seconds) for this to happen however.  Note that failure
693                          * to connect to a particular database is not a problem here,
694                          * because the worker removes itself from the startingWorker
695                          * pointer before trying to connect.  Problems detected by the
696                          * postmaster (like fork() failure) are also reported and handled
697                          * differently.  The only problems that may cause this code to
698                          * fire are errors in the earlier sections of AutoVacWorkerMain,
699                          * before the worker removes the WorkerInfo from the
700                          * startingWorker pointer.
701                          */
702                         waittime = Min(autovacuum_naptime, 60) * 1000;
703                         if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
704                                                                                    waittime))
705                         {
706                                 LWLockRelease(AutovacuumLock);
707                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
708
709                                 /*
710                                  * No other process can put a worker in starting mode, so if
711                                  * startingWorker is still INVALID after exchanging our lock,
712                                  * we assume it's the same one we saw above (so we don't
713                                  * recheck the launch time).
714                                  */
715                                 if (AutoVacuumShmem->av_startingWorker != NULL)
716                                 {
717                                         worker = AutoVacuumShmem->av_startingWorker;
718                                         worker->wi_dboid = InvalidOid;
719                                         worker->wi_tableoid = InvalidOid;
720                                         worker->wi_proc = NULL;
721                                         worker->wi_launchtime = 0;
722                                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
723                                                                         &worker->wi_links);
724                                         AutoVacuumShmem->av_startingWorker = NULL;
725                                         elog(WARNING, "worker took too long to start; canceled");
726                                 }
727                         }
728                         else
729                                 can_launch = false;
730                 }
731                 LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
732
733                 /* if we can't do anything, just go back to sleep */
734                 if (!can_launch)
735                         continue;
736
737                 /* We're OK to start a new worker */
738
739                 if (dlist_is_empty(&DatabaseList))
740                 {
741                         /*
742                          * Special case when the list is empty: start a worker right away.
743                          * This covers the initial case, when no database is in pgstats
744                          * (thus the list is empty).  Note that the constraints in
745                          * launcher_determine_sleep keep us from starting workers too
746                          * quickly (at most once every autovacuum_naptime when the list is
747                          * empty).
748                          */
749                         launch_worker(current_time);
750                 }
751                 else
752                 {
753                         /*
754                          * because rebuild_database_list constructs a list with most
755                          * distant adl_next_worker first, we obtain our database from the
756                          * tail of the list.
757                          */
758                         avl_dbase  *avdb;
759
760                         avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
761
762                         /*
763                          * launch a worker if next_worker is right now or it is in the
764                          * past
765                          */
766                         if (TimestampDifferenceExceeds(avdb->adl_next_worker,
767                                                                                    current_time, 0))
768                                 launch_worker(current_time);
769                 }
770         }
771
772         /* Normal exit from the autovac launcher is here */
773 shutdown:
774         ereport(LOG,
775                         (errmsg("autovacuum launcher shutting down")));
776         AutoVacuumShmem->av_launcherpid = 0;
777
778         proc_exit(0);                           /* done */
779 }
780
781 /*
782  * Determine the time to sleep, based on the database list.
783  *
784  * The "canlaunch" parameter indicates whether we can start a worker right now,
785  * for example due to the workers being all busy.  If this is false, we will
786  * cause a long sleep, which will be interrupted when a worker exits.
787  */
788 static void
789 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap)
790 {
791         /*
792          * We sleep until the next scheduled vacuum.  We trust that when the
793          * database list was built, care was taken so that no entries have times
794          * in the past; if the first entry has too close a next_worker value, or a
795          * time in the past, we will sleep a small nominal time.
796          */
797         if (!canlaunch)
798         {
799                 nap->tv_sec = autovacuum_naptime;
800                 nap->tv_usec = 0;
801         }
802         else if (!dlist_is_empty(&DatabaseList))
803         {
804                 TimestampTz current_time = GetCurrentTimestamp();
805                 TimestampTz next_wakeup;
806                 avl_dbase  *avdb;
807                 long            secs;
808                 int                     usecs;
809
810                 avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
811
812                 next_wakeup = avdb->adl_next_worker;
813                 TimestampDifference(current_time, next_wakeup, &secs, &usecs);
814
815                 nap->tv_sec = secs;
816                 nap->tv_usec = usecs;
817         }
818         else
819         {
820                 /* list is empty, sleep for whole autovacuum_naptime seconds  */
821                 nap->tv_sec = autovacuum_naptime;
822                 nap->tv_usec = 0;
823         }
824
825         /*
826          * If the result is exactly zero, it means a database had an entry with
827          * time in the past.  Rebuild the list so that the databases are evenly
828          * distributed again, and recalculate the time to sleep.  This can happen
829          * if there are more tables needing vacuum than workers, and they all take
830          * longer to vacuum than autovacuum_naptime.
831          *
832          * We only recurse once.  rebuild_database_list should always return times
833          * in the future, but it seems best not to trust too much on that.
834          */
835         if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
836         {
837                 rebuild_database_list(InvalidOid);
838                 launcher_determine_sleep(canlaunch, true, nap);
839                 return;
840         }
841
842         /* The smallest time we'll allow the launcher to sleep. */
843         if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
844         {
845                 nap->tv_sec = 0;
846                 nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
847         }
848
849         /*
850          * If the sleep time is too large, clamp it to an arbitrary maximum (plus
851          * any fractional seconds, for simplicity).  This avoids an essentially
852          * infinite sleep in strange cases like the system clock going backwards a
853          * few years.
854          */
855         if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME)
856                 nap->tv_sec = MAX_AUTOVAC_SLEEPTIME;
857 }
858
859 /*
860  * Build an updated DatabaseList.  It must only contain databases that appear
861  * in pgstats, and must be sorted by next_worker from highest to lowest,
862  * distributed regularly across the next autovacuum_naptime interval.
863  *
864  * Receives the Oid of the database that made this list be generated (we call
865  * this the "new" database, because when the database was already present on
866  * the list, we expect that this function is not called at all).  The
867  * preexisting list, if any, will be used to preserve the order of the
868  * databases in the autovacuum_naptime period.  The new database is put at the
869  * end of the interval.  The actual values are not saved, which should not be
870  * much of a problem.
871  */
872 static void
873 rebuild_database_list(Oid newdb)
874 {
875         List       *dblist;
876         ListCell   *cell;
877         MemoryContext newcxt;
878         MemoryContext oldcxt;
879         MemoryContext tmpcxt;
880         HASHCTL         hctl;
881         int                     score;
882         int                     nelems;
883         HTAB       *dbhash;
884         dlist_iter      iter;
885
886         /* use fresh stats */
887         autovac_refresh_stats();
888
889         newcxt = AllocSetContextCreate(AutovacMemCxt,
890                                                                    "AV dblist",
891                                                                    ALLOCSET_DEFAULT_MINSIZE,
892                                                                    ALLOCSET_DEFAULT_INITSIZE,
893                                                                    ALLOCSET_DEFAULT_MAXSIZE);
894         tmpcxt = AllocSetContextCreate(newcxt,
895                                                                    "tmp AV dblist",
896                                                                    ALLOCSET_DEFAULT_MINSIZE,
897                                                                    ALLOCSET_DEFAULT_INITSIZE,
898                                                                    ALLOCSET_DEFAULT_MAXSIZE);
899         oldcxt = MemoryContextSwitchTo(tmpcxt);
900
901         /*
902          * Implementing this is not as simple as it sounds, because we need to put
903          * the new database at the end of the list; next the databases that were
904          * already on the list, and finally (at the tail of the list) all the
905          * other databases that are not on the existing list.
906          *
907          * To do this, we build an empty hash table of scored databases.  We will
908          * start with the lowest score (zero) for the new database, then
909          * increasing scores for the databases in the existing list, in order, and
910          * lastly increasing scores for all databases gotten via
911          * get_database_list() that are not already on the hash.
912          *
913          * Then we will put all the hash elements into an array, sort the array by
914          * score, and finally put the array elements into the new doubly linked
915          * list.
916          */
917         hctl.keysize = sizeof(Oid);
918         hctl.entrysize = sizeof(avl_dbase);
919         hctl.hcxt = tmpcxt;
920         dbhash = hash_create("db hash", 20, &hctl,      /* magic number here FIXME */
921                                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
922
923         /* start by inserting the new database */
924         score = 0;
925         if (OidIsValid(newdb))
926         {
927                 avl_dbase  *db;
928                 PgStat_StatDBEntry *entry;
929
930                 /* only consider this database if it has a pgstat entry */
931                 entry = pgstat_fetch_stat_dbentry(newdb);
932                 if (entry != NULL)
933                 {
934                         /* we assume it isn't found because the hash was just created */
935                         db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
936
937                         /* hash_search already filled in the key */
938                         db->adl_score = score++;
939                         /* next_worker is filled in later */
940                 }
941         }
942
943         /* Now insert the databases from the existing list */
944         dlist_foreach(iter, &DatabaseList)
945         {
946                 avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
947                 avl_dbase  *db;
948                 bool            found;
949                 PgStat_StatDBEntry *entry;
950
951                 /*
952                  * skip databases with no stat entries -- in particular, this gets rid
953                  * of dropped databases
954                  */
955                 entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
956                 if (entry == NULL)
957                         continue;
958
959                 db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
960
961                 if (!found)
962                 {
963                         /* hash_search already filled in the key */
964                         db->adl_score = score++;
965                         /* next_worker is filled in later */
966                 }
967         }
968
969         /* finally, insert all qualifying databases not previously inserted */
970         dblist = get_database_list();
971         foreach(cell, dblist)
972         {
973                 avw_dbase  *avdb = lfirst(cell);
974                 avl_dbase  *db;
975                 bool            found;
976                 PgStat_StatDBEntry *entry;
977
978                 /* only consider databases with a pgstat entry */
979                 entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
980                 if (entry == NULL)
981                         continue;
982
983                 db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
984                 /* only update the score if the database was not already on the hash */
985                 if (!found)
986                 {
987                         /* hash_search already filled in the key */
988                         db->adl_score = score++;
989                         /* next_worker is filled in later */
990                 }
991         }
992         nelems = score;
993
994         /* from here on, the allocated memory belongs to the new list */
995         MemoryContextSwitchTo(newcxt);
996         dlist_init(&DatabaseList);
997
998         if (nelems > 0)
999         {
1000                 TimestampTz current_time;
1001                 int                     millis_increment;
1002                 avl_dbase  *dbary;
1003                 avl_dbase  *db;
1004                 HASH_SEQ_STATUS seq;
1005                 int                     i;
1006
1007                 /* put all the hash elements into an array */
1008                 dbary = palloc(nelems * sizeof(avl_dbase));
1009
1010                 i = 0;
1011                 hash_seq_init(&seq, dbhash);
1012                 while ((db = hash_seq_search(&seq)) != NULL)
1013                         memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1014
1015                 /* sort the array */
1016                 qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1017
1018                 /*
1019                  * Determine the time interval between databases in the schedule. If
1020                  * we see that the configured naptime would take us to sleep times
1021                  * lower than our min sleep time (which launcher_determine_sleep is
1022                  * coded not to allow), silently use a larger naptime (but don't touch
1023                  * the GUC variable).
1024                  */
1025                 millis_increment = 1000.0 * autovacuum_naptime / nelems;
1026                 if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1027                         millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1028
1029                 current_time = GetCurrentTimestamp();
1030
1031                 /*
1032                  * move the elements from the array into the dllist, setting the
1033                  * next_worker while walking the array
1034                  */
1035                 for (i = 0; i < nelems; i++)
1036                 {
1037                         avl_dbase  *db = &(dbary[i]);
1038
1039                         current_time = TimestampTzPlusMilliseconds(current_time,
1040                                                                                                            millis_increment);
1041                         db->adl_next_worker = current_time;
1042
1043                         /* later elements should go closer to the head of the list */
1044                         dlist_push_head(&DatabaseList, &db->adl_node);
1045                 }
1046         }
1047
1048         /* all done, clean up memory */
1049         if (DatabaseListCxt != NULL)
1050                 MemoryContextDelete(DatabaseListCxt);
1051         MemoryContextDelete(tmpcxt);
1052         DatabaseListCxt = newcxt;
1053         MemoryContextSwitchTo(oldcxt);
1054 }
1055
1056 /* qsort comparator for avl_dbase, using adl_score */
1057 static int
1058 db_comparator(const void *a, const void *b)
1059 {
1060         if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1061                 return 0;
1062         else
1063                 return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1064 }
1065
1066 /*
1067  * do_start_worker
1068  *
1069  * Bare-bones procedure for starting an autovacuum worker from the launcher.
1070  * It determines what database to work on, sets up shared memory stuff and
1071  * signals postmaster to start the worker.  It fails gracefully if invoked when
1072  * autovacuum_workers are already active.
1073  *
1074  * Return value is the OID of the database that the worker is going to process,
1075  * or InvalidOid if no worker was actually started.
1076  */
1077 static Oid
1078 do_start_worker(void)
1079 {
1080         List       *dblist;
1081         ListCell   *cell;
1082         TransactionId xidForceLimit;
1083         MultiXactId multiForceLimit;
1084         bool            for_xid_wrap;
1085         bool            for_multi_wrap;
1086         avw_dbase  *avdb;
1087         TimestampTz current_time;
1088         bool            skipit = false;
1089         Oid                     retval = InvalidOid;
1090         MemoryContext tmpcxt,
1091                                 oldcxt;
1092
1093         /* return quickly when there are no free workers */
1094         LWLockAcquire(AutovacuumLock, LW_SHARED);
1095         if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1096         {
1097                 LWLockRelease(AutovacuumLock);
1098                 return InvalidOid;
1099         }
1100         LWLockRelease(AutovacuumLock);
1101
1102         /*
1103          * Create and switch to a temporary context to avoid leaking the memory
1104          * allocated for the database list.
1105          */
1106         tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1107                                                                    "Start worker tmp cxt",
1108                                                                    ALLOCSET_DEFAULT_MINSIZE,
1109                                                                    ALLOCSET_DEFAULT_INITSIZE,
1110                                                                    ALLOCSET_DEFAULT_MAXSIZE);
1111         oldcxt = MemoryContextSwitchTo(tmpcxt);
1112
1113         /* use fresh stats */
1114         autovac_refresh_stats();
1115
1116         /* Get a list of databases */
1117         dblist = get_database_list();
1118
1119         /*
1120          * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1121          * pass without forcing a vacuum.  (This limit can be tightened for
1122          * particular tables, but not loosened.)
1123          */
1124         recentXid = ReadNewTransactionId();
1125         xidForceLimit = recentXid - autovacuum_freeze_max_age;
1126         /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1127         /* this can cause the limit to go backwards by 3, but that's OK */
1128         if (xidForceLimit < FirstNormalTransactionId)
1129                 xidForceLimit -= FirstNormalTransactionId;
1130
1131         /* Also determine the oldest datminmxid we will consider. */
1132         recentMulti = ReadNextMultiXactId();
1133         multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
1134         if (multiForceLimit < FirstMultiXactId)
1135                 multiForceLimit -= FirstMultiXactId;
1136
1137         /*
1138          * Choose a database to connect to.  We pick the database that was least
1139          * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1140          * wraparound-related data loss.  If any db at risk of Xid wraparound is
1141          * found, we pick the one with oldest datfrozenxid, independently of
1142          * autovacuum times; similarly we pick the one with the oldest datminmxid
1143          * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1144          * danger are given more priority than those in multi wraparound danger.
1145          *
1146          * Note that a database with no stats entry is not considered, except for
1147          * Xid wraparound purposes.  The theory is that if no one has ever
1148          * connected to it since the stats were last initialized, it doesn't need
1149          * vacuuming.
1150          *
1151          * XXX This could be improved if we had more info about whether it needs
1152          * vacuuming before connecting to it.  Perhaps look through the pgstats
1153          * data for the database's tables?  One idea is to keep track of the
1154          * number of new and dead tuples per database in pgstats.  However it
1155          * isn't clear how to construct a metric that measures that and not cause
1156          * starvation for less busy databases.
1157          */
1158         avdb = NULL;
1159         for_xid_wrap = false;
1160         for_multi_wrap = false;
1161         current_time = GetCurrentTimestamp();
1162         foreach(cell, dblist)
1163         {
1164                 avw_dbase  *tmp = lfirst(cell);
1165                 dlist_iter      iter;
1166
1167                 /* Check to see if this one is at risk of wraparound */
1168                 if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1169                 {
1170                         if (avdb == NULL ||
1171                                 TransactionIdPrecedes(tmp->adw_frozenxid,
1172                                                                           avdb->adw_frozenxid))
1173                                 avdb = tmp;
1174                         for_xid_wrap = true;
1175                         continue;
1176                 }
1177                 else if (for_xid_wrap)
1178                         continue;                       /* ignore not-at-risk DBs */
1179                 else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1180                 {
1181                         if (avdb == NULL ||
1182                                 MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1183                                 avdb = tmp;
1184                         for_multi_wrap = true;
1185                         continue;
1186                 }
1187                 else if (for_multi_wrap)
1188                         continue;                       /* ignore not-at-risk DBs */
1189
1190                 /* Find pgstat entry if any */
1191                 tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1192
1193                 /*
1194                  * Skip a database with no pgstat entry; it means it hasn't seen any
1195                  * activity.
1196                  */
1197                 if (!tmp->adw_entry)
1198                         continue;
1199
1200                 /*
1201                  * Also, skip a database that appears on the database list as having
1202                  * been processed recently (less than autovacuum_naptime seconds ago).
1203                  * We do this so that we don't select a database which we just
1204                  * selected, but that pgstat hasn't gotten around to updating the last
1205                  * autovacuum time yet.
1206                  */
1207                 skipit = false;
1208
1209                 dlist_reverse_foreach(iter, &DatabaseList)
1210                 {
1211                         avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1212
1213                         if (dbp->adl_datid == tmp->adw_datid)
1214                         {
1215                                 /*
1216                                  * Skip this database if its next_worker value falls between
1217                                  * the current time and the current time plus naptime.
1218                                  */
1219                                 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1220                                                                                                 current_time, 0) &&
1221                                         !TimestampDifferenceExceeds(current_time,
1222                                                                                                 dbp->adl_next_worker,
1223                                                                                                 autovacuum_naptime * 1000))
1224                                         skipit = true;
1225
1226                                 break;
1227                         }
1228                 }
1229                 if (skipit)
1230                         continue;
1231
1232                 /*
1233                  * Remember the db with oldest autovac time.  (If we are here, both
1234                  * tmp->entry and db->entry must be non-null.)
1235                  */
1236                 if (avdb == NULL ||
1237                         tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1238                         avdb = tmp;
1239         }
1240
1241         /* Found a database -- process it */
1242         if (avdb != NULL)
1243         {
1244                 WorkerInfo      worker;
1245                 dlist_node *wptr;
1246
1247                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1248
1249                 /*
1250                  * Get a worker entry from the freelist.  We checked above, so there
1251                  * really should be a free slot.
1252                  */
1253                 wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1254
1255                 worker = dlist_container(WorkerInfoData, wi_links, wptr);
1256                 worker->wi_dboid = avdb->adw_datid;
1257                 worker->wi_proc = NULL;
1258                 worker->wi_launchtime = GetCurrentTimestamp();
1259
1260                 AutoVacuumShmem->av_startingWorker = worker;
1261
1262                 LWLockRelease(AutovacuumLock);
1263
1264                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1265
1266                 retval = avdb->adw_datid;
1267         }
1268         else if (skipit)
1269         {
1270                 /*
1271                  * If we skipped all databases on the list, rebuild it, because it
1272                  * probably contains a dropped database.
1273                  */
1274                 rebuild_database_list(InvalidOid);
1275         }
1276
1277         MemoryContextSwitchTo(oldcxt);
1278         MemoryContextDelete(tmpcxt);
1279
1280         return retval;
1281 }
1282
1283 /*
1284  * launch_worker
1285  *
1286  * Wrapper for starting a worker from the launcher.  Besides actually starting
1287  * it, update the database list to reflect the next time that another one will
1288  * need to be started on the selected database.  The actual database choice is
1289  * left to do_start_worker.
1290  *
1291  * This routine is also expected to insert an entry into the database list if
1292  * the selected database was previously absent from the list.
1293  */
1294 static void
1295 launch_worker(TimestampTz now)
1296 {
1297         Oid                     dbid;
1298         dlist_iter      iter;
1299
1300         dbid = do_start_worker();
1301         if (OidIsValid(dbid))
1302         {
1303                 bool            found = false;
1304
1305                 /*
1306                  * Walk the database list and update the corresponding entry.  If the
1307                  * database is not on the list, we'll recreate the list.
1308                  */
1309                 dlist_foreach(iter, &DatabaseList)
1310                 {
1311                         avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1312
1313                         if (avdb->adl_datid == dbid)
1314                         {
1315                                 found = true;
1316
1317                                 /*
1318                                  * add autovacuum_naptime seconds to the current time, and use
1319                                  * that as the new "next_worker" field for this database.
1320                                  */
1321                                 avdb->adl_next_worker =
1322                                         TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1323
1324                                 dlist_move_head(&DatabaseList, iter.cur);
1325                                 break;
1326                         }
1327                 }
1328
1329                 /*
1330                  * If the database was not present in the database list, we rebuild
1331                  * the list.  It's possible that the database does not get into the
1332                  * list anyway, for example if it's a database that doesn't have a
1333                  * pgstat entry, but this is not a problem because we don't want to
1334                  * schedule workers regularly into those in any case.
1335                  */
1336                 if (!found)
1337                         rebuild_database_list(dbid);
1338         }
1339 }
1340
1341 /*
1342  * Called from postmaster to signal a failure to fork a process to become
1343  * worker.  The postmaster should kill(SIGUSR2) the launcher shortly
1344  * after calling this function.
1345  */
1346 void
1347 AutoVacWorkerFailed(void)
1348 {
1349         AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1350 }
1351
1352 /* SIGHUP: set flag to re-read config file at next convenient time */
1353 static void
1354 av_sighup_handler(SIGNAL_ARGS)
1355 {
1356         int                     save_errno = errno;
1357
1358         got_SIGHUP = true;
1359         SetLatch(MyLatch);
1360
1361         errno = save_errno;
1362 }
1363
1364 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1365 static void
1366 avl_sigusr2_handler(SIGNAL_ARGS)
1367 {
1368         int                     save_errno = errno;
1369
1370         got_SIGUSR2 = true;
1371         SetLatch(MyLatch);
1372
1373         errno = save_errno;
1374 }
1375
1376 /* SIGTERM: time to die */
1377 static void
1378 avl_sigterm_handler(SIGNAL_ARGS)
1379 {
1380         int                     save_errno = errno;
1381
1382         got_SIGTERM = true;
1383         SetLatch(MyLatch);
1384
1385         errno = save_errno;
1386 }
1387
1388
1389 /********************************************************************
1390  *                                        AUTOVACUUM WORKER CODE
1391  ********************************************************************/
1392
1393 #ifdef EXEC_BACKEND
/*
 * forkexec routine for the autovacuum worker (EXEC_BACKEND builds only).
 *
 * Builds the argument vector and hands it to postmaster_forkexec, whose
 * result is returned unchanged.
 */
static pid_t
avworker_forkexec(void)
{
	char	   *argvec[10];
	int			nargs;

	argvec[0] = "postgres";
	argvec[1] = "--forkavworker";
	argvec[2] = NULL;			/* filled in by postmaster_forkexec */
	nargs = 3;

	/* terminate the vector; the terminator is not counted in nargs */
	argvec[nargs] = NULL;

	Assert(nargs < lengthof(argvec));

	return postmaster_forkexec(nargs, argvec);
}
1414
1415 /*
1416  * We need this set from the outside, before InitProcess is called
1417  */
1418 void
1419 AutovacuumWorkerIAm(void)
1420 {
1421         am_autovacuum_worker = true;
1422 }
1423 #endif
1424
1425 /*
1426  * Main entry point for autovacuum worker process.
1427  * Returns the new worker's PID to the caller, or 0 if the fork failed (the failure is logged at LOG level).
1428  * This code is heavily based on pgarch.c, q.v.
1429  */
1430 int
1431 StartAutoVacWorker(void)
1432 {
1433         pid_t           worker_pid;
1434
1435 #ifdef EXEC_BACKEND
1436         switch ((worker_pid = avworker_forkexec()))
1437 #else
1438         switch ((worker_pid = fork_process()))
1439 #endif
1440         {
1441                 case -1:                        /* fork failed: report it and tell the caller via 0 */
1442                         ereport(LOG,
1443                                         (errmsg("could not fork autovacuum worker process: %m")));
1444                         return 0;
1445
1446 #ifndef EXEC_BACKEND
1447                 case 0:
1448                         /* in postmaster child ... */
1449                         InitPostmasterChild();
1450
1451                         /* Close the postmaster's sockets */
1452                         ClosePostmasterPorts(false);
1453
1454                         AutoVacWorkerMain(0, NULL);     /* its visible code paths end in proc_exit(0) */
1455                         break;
1456 #endif
1457                 default:                        /* parent side: hand the child's PID back */
1458                         return (int) worker_pid;
1459         }
1460
1461         /* shouldn't get here */
1462         return 0;
1463 }
1464
1465 /*
1466  * AutoVacWorkerMain -- worker process body (argc/argv are not referenced); both the normal and the error-recovery path end in proc_exit(0).
1467  */
1468 NON_EXEC_STATIC void
1469 AutoVacWorkerMain(int argc, char *argv[])
1470 {
1471         sigjmp_buf      local_sigjmp_buf;
1472         Oid                     dbid;
1473
1474         am_autovacuum_worker = true;
1475
1476         /* Identify myself via ps */
1477         init_ps_display("autovacuum worker process", "", "", "");
1478
1479         SetProcessingMode(InitProcessing);
1480
1481         /*
1482          * Set up signal handlers.  We operate on databases much like a regular
1483          * backend, so we use the same signal handling.  See equivalent code in
1484          * tcop/postgres.c.
1485          */
1486         pqsignal(SIGHUP, av_sighup_handler);
1487
1488         /*
1489          * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1490          * means abort and exit cleanly, and SIGQUIT means abandon ship.
1491          */
1492         pqsignal(SIGINT, StatementCancelHandler);
1493         pqsignal(SIGTERM, die);
1494         pqsignal(SIGQUIT, quickdie);
1495         InitializeTimeouts();           /* establishes SIGALRM handler */
1496
1497         pqsignal(SIGPIPE, SIG_IGN);
1498         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1499         pqsignal(SIGUSR2, SIG_IGN);
1500         pqsignal(SIGFPE, FloatExceptionHandler);
1501         pqsignal(SIGCHLD, SIG_DFL);
1502
1503         /* Early initialization */
1504         BaseInit();
1505
1506         /*
1507          * Create a per-backend PGPROC struct in shared memory, except in the
1508          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1509          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1510          * had to do some stuff with LWLocks).
1511          */
1512 #ifndef EXEC_BACKEND
1513         InitProcess();
1514 #endif
1515
1516         /*
1517          * If an exception is encountered, processing resumes here.
1518          *
1519          * See notes in postgres.c about the design of this coding.
1520          */
1521         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1522         {
1523                 /* Prevents interrupts while cleaning up */
1524                 HOLD_INTERRUPTS();
1525
1526                 /* Report the error to the server log */
1527                 EmitErrorReport();
1528
1529                 /*
1530                  * We can now go away.  Note that because we called InitProcess, a
1531                  * callback was registered to do ProcKill, which will clean up
1532                  * necessary state.
1533                  */
1534                 proc_exit(0);
1535         }
1536
1537         /* We can now handle ereport(ERROR) */
1538         PG_exception_stack = &local_sigjmp_buf;
1539
1540         PG_SETMASK(&UnBlockSig);
1541
1542         /*
1543          * Force zero_damaged_pages OFF in the autovac process, even if it is set
1544          * in postgresql.conf.  We don't really want such a dangerous option being
1545          * applied non-interactively.
1546          */
1547         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1548
1549         /*
1550          * Force statement_timeout and lock_timeout to zero to avoid letting these
1551          * settings prevent regular maintenance from being executed.
1552          */
1553         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1554         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1555
1556         /*
1557          * Force default_transaction_isolation to READ COMMITTED.  We don't want
1558          * to pay the overhead of serializable mode, nor add any risk of causing
1559          * deadlocks or delaying other transactions.
1560          */
1561         SetConfigOption("default_transaction_isolation", "read committed",
1562                                         PGC_SUSET, PGC_S_OVERRIDE);
1563
1564         /*
1565          * Force synchronous replication off to allow regular maintenance even if
1566          * we are waiting for standbys to connect. This is important to ensure we
1567          * aren't blocked from performing anti-wraparound tasks.
1568          */
1569         if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1570                 SetConfigOption("synchronous_commit", "local",
1571                                                 PGC_SUSET, PGC_S_OVERRIDE);
1572
1573         /*
1574          * Get the info about the database we're going to work on.
1575          */
1576         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1577
1578         /*
1579          * beware of av_startingWorker being NULL; this should normally not
1580          * happen, but if a worker fails after forking and before this, the
1581          * launcher might have decided to remove it from the queue and start
1582          * again.
1583          */
1584         if (AutoVacuumShmem->av_startingWorker != NULL)
1585         {
1586                 MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1587                 dbid = MyWorkerInfo->wi_dboid;
1588                 MyWorkerInfo->wi_proc = MyProc;
1589
1590                 /* insert into the running list */
1591                 dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1592                                                 &MyWorkerInfo->wi_links);
1593
1594                 /*
1595                  * remove from the "starting" pointer, so that the launcher can start
1596                  * a new worker if required
1597                  */
1598                 AutoVacuumShmem->av_startingWorker = NULL;
1599                 LWLockRelease(AutovacuumLock);
1600
1601                 on_shmem_exit(FreeWorkerInfo, 0);       /* give the slot back when this process exits */
1602
1603                 /* wake up the launcher (note: av_launcherpid is read here after AutovacuumLock was released) */
1604                 if (AutoVacuumShmem->av_launcherpid != 0)
1605                         kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1606         }
1607         else
1608         {
1609                 /* no worker entry for me, go away */
1610                 elog(WARNING, "autovacuum worker started without a worker entry");
1611                 dbid = InvalidOid;
1612                 LWLockRelease(AutovacuumLock);
1613         }
1614
1615         if (OidIsValid(dbid))
1616         {
1617                 char            dbname[NAMEDATALEN];
1618
1619                 /*
1620                  * Report autovac startup to the stats collector.  We deliberately do
1621                  * this before InitPostgres, so that the last_autovac_time will get
1622                  * updated even if the connection attempt fails.  This is to prevent
1623                  * autovac from getting "stuck" repeatedly selecting an unopenable
1624                  * database, rather than making any progress on stuff it can connect
1625                  * to.
1626                  */
1627                 pgstat_report_autovac(dbid);
1628
1629                 /*
1630                  * Connect to the selected database
1631                  *
1632                  * Note: if we have selected a just-deleted database (due to using
1633                  * stale stats info), we'll fail and exit here.
1634                  */
1635                 InitPostgres(NULL, dbid, NULL, InvalidOid, dbname);
1636                 SetProcessingMode(NormalProcessing);
1637                 set_ps_display(dbname, false);
1638                 ereport(DEBUG1,
1639                                 (errmsg("autovacuum: processing database \"%s\"", dbname)));
1640
1641                 if (PostAuthDelay)
1642                         pg_usleep(PostAuthDelay * 1000000L);
1643
1644                 /* And do an appropriate amount of work */
1645                 recentXid = ReadNewTransactionId();
1646                 recentMulti = ReadNextMultiXactId();
1647                 do_autovacuum();
1648         }
1649
1650         /*
1651          * The launcher will be notified of my death in ProcKill, *if* we managed
1652          * to get a worker slot at all
1653          */
1654
1655         /* All done, go away */
1656         proc_exit(0);
1657 }
1658
1659 /*
1660  * FreeWorkerInfo -- on_shmem_exit callback: return my WorkerInfo to the free list (code and arg are not used).
1661  */
1662 static void
1663 FreeWorkerInfo(int code, Datum arg)
1664 {
1665         if (MyWorkerInfo != NULL)
1666         {
1667                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1668
1669                 /*
1670                  * Wake the launcher up so that it can launch a new worker immediately
1671                  * if required.  We only save the launcher's PID in local memory here;
1672                  * the actual signal will be sent when the PGPROC is recycled.  Note
1673                  * that we always do this, so that the launcher can rebalance the cost
1674                  * limit setting of the remaining workers.
1675                  *
1676                  * We somewhat ignore the risk that the launcher changes its PID
1677                  * between us reading it and the actual kill; we expect ProcKill to be
1678                  * called shortly after us, and we assume that PIDs are not reused too
1679                  * quickly after a process exits.
1680                  */
1681                 AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1682
1683                 dlist_delete(&MyWorkerInfo->wi_links);  /* unlink from whatever list the entry is on */
1684                 MyWorkerInfo->wi_dboid = InvalidOid;
1685                 MyWorkerInfo->wi_tableoid = InvalidOid;
1686                 MyWorkerInfo->wi_proc = NULL;
1687                 MyWorkerInfo->wi_launchtime = 0;
1688                 MyWorkerInfo->wi_dobalance = false;
1689                 MyWorkerInfo->wi_cost_delay = 0;
1690                 MyWorkerInfo->wi_cost_limit = 0;
1691                 MyWorkerInfo->wi_cost_limit_base = 0;
1692                 dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1693                                                 &MyWorkerInfo->wi_links);
1694                 /* not mine anymore */
1695                 MyWorkerInfo = NULL;
1696
1697                 /*
1698                  * now that we're inactive, cause a rebalancing of the surviving
1699                  * workers
1700                  */
1701                 AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1702                 LWLockRelease(AutovacuumLock);
1703         }
1704 }
1705
1706 /*
1707  * Update the cost-based delay parameters, so that multiple workers consume
1708  * each a fraction of the total available I/O.
1709  */
1710 void
1711 AutoVacuumUpdateDelay(void)
1712 {
1713         if (MyWorkerInfo)               /* nothing to copy when MyWorkerInfo is not set */
1714         {
1715                 VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1716                 VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1717         }
1718 }
1719
1720 /*
1721  * autovac_balance_cost
1722  *              Recalculate the cost limit setting for each active worker.
1723  *              Returns without changes if no limit/delay is in effect or no running worker takes part.
1724  * Caller must hold the AutovacuumLock in exclusive mode.
1725  */
1726 static void
1727 autovac_balance_cost(void)
1728 {
1729         /*
1730          * The idea here is that we ration out I/O equally.  The amount of I/O
1731          * that a worker can consume is determined by cost_limit/cost_delay, so we
1732          * try to equalize those ratios rather than the raw limit settings.
1733          *
1734          * note: in cost_limit, zero also means use value from elsewhere, because
1735          * zero is not a valid value.
1736          */
1737         int                     vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1738                                                                 autovacuum_vac_cost_limit : VacuumCostLimit);
1739         int                     vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1740                                                                 autovacuum_vac_cost_delay : VacuumCostDelay);
1741         double          cost_total;
1742         double          cost_avail;
1743         dlist_iter      iter;
1744
1745         /* not set? nothing to do */
1746         if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1747                 return;
1748
1749         /* calculate the total base cost limit of participating active workers */
1750         cost_total = 0.0;
1751         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1752         {
1753                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1754
1755                 if (worker->wi_proc != NULL &&
1756                         worker->wi_dobalance &&
1757                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1758                         cost_total +=
1759                                 (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1760         }
1761
1762         /* there are no cost limits -- nothing to do */
1763         if (cost_total <= 0)
1764                 return;
1765
1766         /*
1767          * Adjust cost limit of each active worker to balance the total of cost
1768          * limit to autovacuum_vacuum_cost_limit.
1769          */
1770         cost_avail = (double) vac_cost_limit / vac_cost_delay;
1771         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1772         {
1773                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1774
1775                 if (worker->wi_proc != NULL &&
1776                         worker->wi_dobalance &&
1777                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1778                 {
1779                         int                     limit = (int)
1780                         (cost_avail * worker->wi_cost_limit_base / cost_total);
1781
1782                         /*
1783                          * We put a lower bound of 1 on the cost_limit, to avoid division-
1784                          * by-zero in the vacuum code.  Also, in case of roundoff trouble
1785                          * in these calculations, let's be sure we don't ever set
1786                          * cost_limit to more than the base value.
1787                          */
1788                         worker->wi_cost_limit = Max(Min(limit,
1789                                                                                         worker->wi_cost_limit_base),
1790                                                                                 1);
1791                 }
1792                 /* Log every worker that has a PGPROC, balanced or not.  NOTE(review): pid is printed with %u -- confirm the type of wi_proc->pid. */
1793                 if (worker->wi_proc != NULL)
1794                         elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1795                                  worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1796                                  worker->wi_dobalance ? "yes" : "no",
1797                                  worker->wi_cost_limit, worker->wi_cost_limit_base,
1798                                  worker->wi_cost_delay);
1799         }
1800 }
1801
1802 /*
1803  * get_database_list
1804  *              Return a list of all databases found in pg_database, one avw_dbase per row.
1805  *
1806  * The list and associated data is allocated in the caller's memory context,
1807  * which is in charge of ensuring that it's properly cleaned up afterwards.
1808  *
1809  * Note: this is the only function in which the autovacuum launcher uses a
1810  * transaction.  Although we aren't attached to any particular database and
1811  * therefore can't access most catalogs, we do have enough infrastructure
1812  * to do a seqscan on pg_database.
1813  */
1814 static List *
1815 get_database_list(void)
1816 {
1817         List       *dblist = NIL;
1818         Relation        rel;
1819         HeapScanDesc scan;
1820         HeapTuple       tup;
1821         MemoryContext resultcxt;
1822
1823         /* This is the context that we will allocate our output data in */
1824         resultcxt = CurrentMemoryContext;
1825
1826         /*
1827          * Start a transaction so we can access pg_database, and get a snapshot.
1828          * We don't have a use for the snapshot itself, but we're interested in
1829          * the secondary effect that it sets RecentGlobalXmin.  (This is critical
1830          * for anything that reads heap pages, because HOT may decide to prune
1831          * them even if the process doesn't attempt to modify any tuples.)
1832          */
1833         StartTransactionCommand();
1834         (void) GetTransactionSnapshot();
1835
1836         rel = heap_open(DatabaseRelationId, AccessShareLock);
1837         scan = heap_beginscan_catalog(rel, 0, NULL);
1838
1839         while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1840         {
1841                 Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1842                 avw_dbase  *avdb;
1843                 MemoryContext oldcxt;
1844
1845                 /*
1846                  * Allocate our results in the caller's context, not the
1847                  * transaction's. We do this inside the loop, and restore the original
1848                  * context at the end, so that leaky things like heap_getnext() are
1849                  * not called in a potentially long-lived context.
1850                  */
1851                 oldcxt = MemoryContextSwitchTo(resultcxt);
1852
1853                 avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1854
1855                 avdb->adw_datid = HeapTupleGetOid(tup);
1856                 avdb->adw_name = pstrdup(NameStr(pgdatabase->datname)); /* private copy of the name */
1857                 avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1858                 avdb->adw_minmulti = pgdatabase->datminmxid;
1859                 /* this gets set later: */
1860                 avdb->adw_entry = NULL;
1861
1862                 dblist = lappend(dblist, avdb);
1863                 MemoryContextSwitchTo(oldcxt);
1864         }
1865
1866         heap_endscan(scan);
1867         heap_close(rel, AccessShareLock);
1868
1869         CommitTransactionCommand();
1870
1871         return dblist;
1872 }
1873
1874 /*
1875  * Process a database table-by-table
1876  *
1877  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1878  * order not to ignore shutdown commands for too long.
1879  */
1880 static void
1881 do_autovacuum(void)
1882 {
1883         Relation        classRel;
1884         HeapTuple       tuple;
1885         HeapScanDesc relScan;
1886         Form_pg_database dbForm;
1887         List       *table_oids = NIL;
1888         HASHCTL         ctl;
1889         HTAB       *table_toast_map;
1890         ListCell   *volatile cell;
1891         PgStat_StatDBEntry *shared;
1892         PgStat_StatDBEntry *dbentry;
1893         BufferAccessStrategy bstrategy;
1894         ScanKeyData key;
1895         TupleDesc       pg_class_desc;
1896         int                     effective_multixact_freeze_max_age;
1897
1898         /*
1899          * StartTransactionCommand and CommitTransactionCommand will automatically
1900          * switch to other contexts.  We need this one to keep the list of
1901          * relations to vacuum/analyze across transactions.
1902          */
1903         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1904                                                                                   "AV worker",
1905                                                                                   ALLOCSET_DEFAULT_MINSIZE,
1906                                                                                   ALLOCSET_DEFAULT_INITSIZE,
1907                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
1908         MemoryContextSwitchTo(AutovacMemCxt);
1909
1910         /*
1911          * may be NULL if we couldn't find an entry (only happens if we are
1912          * forcing a vacuum for anti-wrap purposes).
1913          */
1914         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1915
1916         /* Start a transaction so our commands have one to play into. */
1917         StartTransactionCommand();
1918
1919         /*
1920          * Clean up any dead statistics collector entries for this DB. We always
1921          * want to do this exactly once per DB-processing cycle, even if we find
1922          * nothing worth vacuuming in the database.
1923          */
1924         pgstat_vacuum_stat();
1925
1926         /*
1927          * Compute the multixact age for which freezing is urgent.  This is
1928          * normally autovacuum_multixact_freeze_max_age, but may be less if we are
1929          * short of multixact member space.
1930          */
1931         effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
1932
1933         /*
1934          * Find the pg_database entry and select the default freeze ages. We use
1935          * zero in template and nonconnectable databases, else the system-wide
1936          * default.
1937          */
1938         tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
1939         if (!HeapTupleIsValid(tuple))
1940                 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1941         dbForm = (Form_pg_database) GETSTRUCT(tuple);
1942
1943         if (dbForm->datistemplate || !dbForm->datallowconn)
1944         {
1945                 default_freeze_min_age = 0;
1946                 default_freeze_table_age = 0;
1947                 default_multixact_freeze_min_age = 0;
1948                 default_multixact_freeze_table_age = 0;
1949         }
1950         else
1951         {
1952                 default_freeze_min_age = vacuum_freeze_min_age;
1953                 default_freeze_table_age = vacuum_freeze_table_age;
1954                 default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
1955                 default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
1956         }
1957
1958         ReleaseSysCache(tuple);
1959
1960         /* StartTransactionCommand changed elsewhere */
1961         MemoryContextSwitchTo(AutovacMemCxt);
1962
1963         /* The database hash where pgstat keeps shared relations */
1964         shared = pgstat_fetch_stat_dbentry(InvalidOid);
1965
1966         classRel = heap_open(RelationRelationId, AccessShareLock);
1967
1968         /* create a copy so we can use it after closing pg_class */
1969         pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
1970
1971         /* create hash table for toast <-> main relid mapping */
1972         MemSet(&ctl, 0, sizeof(ctl));
1973         ctl.keysize = sizeof(Oid);
1974         ctl.entrysize = sizeof(av_relation);
1975
1976         table_toast_map = hash_create("TOAST to main relid map",
1977                                                                   100,
1978                                                                   &ctl,
1979                                                                   HASH_ELEM | HASH_BLOBS);
1980
1981         /*
1982          * Scan pg_class to determine which tables to vacuum.
1983          *
1984          * We do this in two passes: on the first one we collect the list of plain
1985          * relations and materialized views, and on the second one we collect
1986          * TOAST tables. The reason for doing the second pass is that during it we
1987          * want to use the main relation's pg_class.reloptions entry if the TOAST
1988          * table does not have any, and we cannot obtain it unless we know
1989          * beforehand what the main table's OID is.
1990          *
1991          * We need to check TOAST tables separately because in cases with short,
1992          * wide tables there might be proportionally much more activity in the
1993          * TOAST table than in its parent.
1994          */
1995         relScan = heap_beginscan_catalog(classRel, 0, NULL);
1996
1997         /*
1998          * On the first pass, we collect main tables to vacuum, and also the main
1999          * table relid to TOAST relid mapping.
2000          */
2001         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2002         {
2003                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2004                 PgStat_StatTabEntry *tabentry;
2005                 AutoVacOpts *relopts;
2006                 Oid                     relid;
2007                 bool            dovacuum;
2008                 bool            doanalyze;
2009                 bool            wraparound;
2010
2011                 if (classForm->relkind != RELKIND_RELATION &&
2012                         classForm->relkind != RELKIND_MATVIEW)
2013                         continue;
2014
2015                 relid = HeapTupleGetOid(tuple);
2016
2017                 /* Fetch reloptions and the pgstat entry for this table */
2018                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2019                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2020                                                                                          shared, dbentry);
2021
2022                 /* Check if it needs vacuum or analyze */
2023                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2024                                                                   effective_multixact_freeze_max_age,
2025                                                                   &dovacuum, &doanalyze, &wraparound);
2026
2027                 /*
2028                  * Check if it is a temp table (presumably, of some other backend's).
2029                  * We cannot safely process other backends' temp tables.
2030                  */
2031                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2032                 {
2033                         int                     backendID;
2034
2035                         backendID = GetTempNamespaceBackendId(classForm->relnamespace);
2036
2037                         /* We just ignore it if the owning backend is still active */
2038                         if (backendID == MyBackendId || BackendIdGetProc(backendID) == NULL)
2039                         {
2040                                 /*
2041                                  * We found an orphan temp table (which was probably left
2042                                  * behind by a crashed backend).  If it's so old as to need
2043                                  * vacuum for wraparound, forcibly drop it.  Otherwise just
2044                                  * log a complaint.
2045                                  */
2046                                 if (wraparound)
2047                                 {
2048                                         ObjectAddress object;
2049
2050                                         ereport(LOG,
2051                                                         (errmsg("autovacuum: dropping orphan temp table \"%s\".\"%s\" in database \"%s\"",
2052                                                                  get_namespace_name(classForm->relnamespace),
2053                                                                         NameStr(classForm->relname),
2054                                                                         get_database_name(MyDatabaseId))));
2055                                         object.classId = RelationRelationId;
2056                                         object.objectId = relid;
2057                                         object.objectSubId = 0;
2058                                         performDeletion(&object, DROP_CASCADE, PERFORM_DELETION_INTERNAL);
2059                                 }
2060                                 else
2061                                 {
2062                                         ereport(LOG,
2063                                                         (errmsg("autovacuum: found orphan temp table \"%s\".\"%s\" in database \"%s\"",
2064                                                                  get_namespace_name(classForm->relnamespace),
2065                                                                         NameStr(classForm->relname),
2066                                                                         get_database_name(MyDatabaseId))));
2067                                 }
2068                         }
2069                 }
2070                 else
2071                 {
2072                         /* relations that need work are added to table_oids */
2073                         if (dovacuum || doanalyze)
2074                                 table_oids = lappend_oid(table_oids, relid);
2075
2076                         /*
2077                          * Remember the association for the second pass.  Note: we must do
2078                          * this even if the table is going to be vacuumed, because we
2079                          * don't automatically vacuum toast tables along the parent table.
2080                          */
2081                         if (OidIsValid(classForm->reltoastrelid))
2082                         {
2083                                 av_relation *hentry;
2084                                 bool            found;
2085
2086                                 hentry = hash_search(table_toast_map,
2087                                                                          &classForm->reltoastrelid,
2088                                                                          HASH_ENTER, &found);
2089
2090                                 if (!found)
2091                                 {
2092                                         /* hash_search already filled in the key */
2093                                         hentry->ar_relid = relid;
2094                                         hentry->ar_hasrelopts = false;
2095                                         if (relopts != NULL)
2096                                         {
2097                                                 hentry->ar_hasrelopts = true;
2098                                                 memcpy(&hentry->ar_reloptions, relopts,
2099                                                            sizeof(AutoVacOpts));
2100                                         }
2101                                 }
2102                         }
2103                 }
2104         }
2105
2106         heap_endscan(relScan);
2107
2108         /* second pass: check TOAST tables */
2109         ScanKeyInit(&key,
2110                                 Anum_pg_class_relkind,
2111                                 BTEqualStrategyNumber, F_CHAREQ,
2112                                 CharGetDatum(RELKIND_TOASTVALUE));
2113
2114         relScan = heap_beginscan_catalog(classRel, 1, &key);
2115         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2116         {
2117                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2118                 PgStat_StatTabEntry *tabentry;
2119                 Oid                     relid;
2120                 AutoVacOpts *relopts = NULL;
2121                 bool            dovacuum;
2122                 bool            doanalyze;
2123                 bool            wraparound;
2124
2125                 /*
2126                  * We cannot safely process other backends' temp tables, so skip 'em.
2127                  */
2128                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2129                         continue;
2130
2131                 relid = HeapTupleGetOid(tuple);
2132
2133                 /*
2134                  * fetch reloptions -- if this toast table does not have them, try the
2135                  * main rel
2136                  */
2137                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2138                 if (relopts == NULL)
2139                 {
2140                         av_relation *hentry;
2141                         bool            found;
2142
2143                         hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2144                         if (found && hentry->ar_hasrelopts)
2145                                 relopts = &hentry->ar_reloptions;
2146                 }
2147
2148                 /* Fetch the pgstat entry for this table */
2149                 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2150                                                                                          shared, dbentry);
2151
2152                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2153                                                                   effective_multixact_freeze_max_age,
2154                                                                   &dovacuum, &doanalyze, &wraparound);
2155
2156                 /* ignore analyze for toast tables */
2157                 if (dovacuum)
2158                         table_oids = lappend_oid(table_oids, relid);
2159         }
2160
2161         heap_endscan(relScan);
2162         heap_close(classRel, AccessShareLock);
2163
2164         /*
2165          * Create a buffer access strategy object for VACUUM to use.  We want to
2166          * use the same one across all the vacuum operations we perform, since the
2167          * point is for VACUUM not to blow out the shared cache.
2168          */
2169         bstrategy = GetAccessStrategy(BAS_VACUUM);
2170
2171         /*
2172          * create a memory context to act as fake PortalContext, so that the
2173          * contexts created in the vacuum code are cleaned up for each table.
2174          */
2175         PortalContext = AllocSetContextCreate(AutovacMemCxt,
2176                                                                                   "Autovacuum Portal",
2177                                                                                   ALLOCSET_DEFAULT_INITSIZE,
2178                                                                                   ALLOCSET_DEFAULT_MINSIZE,
2179                                                                                   ALLOCSET_DEFAULT_MAXSIZE);
2180
2181         /*
2182          * Perform operations on collected tables.
2183          */
2184         foreach(cell, table_oids)
2185         {
2186                 Oid                     relid = lfirst_oid(cell);
2187                 autovac_table *tab;
2188                 bool            skipit;
2189                 int                     stdVacuumCostDelay;
2190                 int                     stdVacuumCostLimit;
2191                 dlist_iter      iter;
2192
2193                 CHECK_FOR_INTERRUPTS();
2194
2195                 /*
2196                  * Check for config changes before processing each collected table.
2197                  */
2198                 if (got_SIGHUP)
2199                 {
2200                         got_SIGHUP = false;
2201                         ProcessConfigFile(PGC_SIGHUP);
2202
2203                         /*
2204                          * You might be tempted to bail out if we see autovacuum is now
2205                          * disabled.  Must resist that temptation -- this might be a
2206                          * for-wraparound emergency worker, in which case that would be
2207                          * entirely inappropriate.
2208                          */
2209                 }
2210
2211                 /*
2212                  * hold schedule lock from here until we're sure that this table still
2213                  * needs vacuuming.  We also need the AutovacuumLock to walk the
2214                  * worker array, but we'll let go of that one quickly.
2215                  */
2216                 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2217                 LWLockAcquire(AutovacuumLock, LW_SHARED);
2218
2219                 /*
2220                  * Check whether the table is being vacuumed concurrently by another
2221                  * worker.
2222                  */
2223                 skipit = false;
2224                 dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2225                 {
2226                         WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2227
2228                         /* ignore myself */
2229                         if (worker == MyWorkerInfo)
2230                                 continue;
2231
2232                         /* ignore workers in other databases */
2233                         if (worker->wi_dboid != MyDatabaseId)
2234                                 continue;
2235
2236                         if (worker->wi_tableoid == relid)
2237                         {
2238                                 skipit = true;
2239                                 break;
2240                         }
2241                 }
2242                 LWLockRelease(AutovacuumLock);
2243                 if (skipit)
2244                 {
2245                         LWLockRelease(AutovacuumScheduleLock);
2246                         continue;
2247                 }
2248
2249                 /*
2250                  * Check whether pgstat data still says we need to vacuum this table.
2251                  * It could have changed if something else processed the table while
2252                  * we weren't looking.
2253                  *
2254                  * Note: we have a special case in pgstat code to ensure that the
2255                  * stats we read are as up-to-date as possible, to avoid the problem
2256                  * that somebody just finished vacuuming this table.  The window to
2257                  * the race condition is not closed but it is very small.
2258                  */
2259                 MemoryContextSwitchTo(AutovacMemCxt);
2260                 tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
2261                                                                         effective_multixact_freeze_max_age);
2262                 if (tab == NULL)
2263                 {
2264                         /* someone else vacuumed the table, or it went away */
2265                         LWLockRelease(AutovacuumScheduleLock);
2266                         continue;
2267                 }
2268
2269                 /*
2270                  * Ok, good to go.  Store the table in shared memory before releasing
2271                  * the lock so that other workers don't vacuum it concurrently.
2272                  */
2273                 MyWorkerInfo->wi_tableoid = relid;
2274                 LWLockRelease(AutovacuumScheduleLock);
2275
2276                 /*
2277                  * Remember the prevailing values of the vacuum cost GUCs.  We have to
2278                  * restore these at the bottom of the loop, else we'll compute wrong
2279                  * values in the next iteration of autovac_balance_cost().
2280                  */
2281                 stdVacuumCostDelay = VacuumCostDelay;
2282                 stdVacuumCostLimit = VacuumCostLimit;
2283
2284                 /* Must hold AutovacuumLock while mucking with cost balance info */
2285                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2286
2287                 /* advertise my cost delay parameters for the balancing algorithm */
2288                 MyWorkerInfo->wi_dobalance = tab->at_dobalance;
2289                 MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2290                 MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2291                 MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2292
2293                 /* do a balance */
2294                 autovac_balance_cost();
2295
2296                 /* set the active cost parameters from the result of that */
2297                 AutoVacuumUpdateDelay();
2298
2299                 /* done */
2300                 LWLockRelease(AutovacuumLock);
2301
2302                 /* clean up memory before each iteration */
2303                 MemoryContextResetAndDeleteChildren(PortalContext);
2304
2305                 /*
2306                  * Save the relation name for a possible error message, to avoid a
2307                  * catalog lookup in case of an error.  If any of these return NULL,
2308                  * then the relation has been dropped since last we checked; skip it.
2309                  * Note: they must live in a long-lived memory context because we call
2310                  * vacuum and analyze in different transactions.
2311                  */
2312
2313                 tab->at_relname = get_rel_name(tab->at_relid);
2314                 tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2315                 tab->at_datname = get_database_name(MyDatabaseId);
2316                 if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2317                         goto deleted;
2318
2319                 /*
2320                  * We will abort vacuuming the current table if something errors out,
2321                  * and continue with the next one in schedule; in particular, this
2322                  * happens if we are interrupted with SIGINT.
2323                  */
2324                 PG_TRY();
2325                 {
2326                         /* have at it */
2327                         MemoryContextSwitchTo(TopTransactionContext);
2328                         autovacuum_do_vac_analyze(tab, bstrategy);
2329
2330                         /*
2331                          * Clear a possible query-cancel signal, to avoid a late reaction
2332                          * to an automatically-sent signal because of vacuuming the
2333                          * current table (we're done with it, so it would make no sense to
2334                          * cancel at this point.)
2335                          */
2336                         QueryCancelPending = false;
2337                 }
2338                 PG_CATCH();
2339                 {
2340                         /*
2341                          * Abort the transaction, start a new one, and proceed with the
2342                          * next table in our list.
2343                          */
2344                         HOLD_INTERRUPTS();
2345                         if (tab->at_vacoptions & VACOPT_VACUUM)
2346                                 errcontext("automatic vacuum of table \"%s.%s.%s\"",
2347                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2348                         else
2349                                 errcontext("automatic analyze of table \"%s.%s.%s\"",
2350                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2351                         EmitErrorReport();
2352
2353                         /* this resets the PGXACT flags too */
2354                         AbortOutOfAnyTransaction();
2355                         FlushErrorState();
2356                         MemoryContextResetAndDeleteChildren(PortalContext);
2357
2358                         /* restart our transaction for the following operations */
2359                         StartTransactionCommand();
2360                         RESUME_INTERRUPTS();
2361                 }
2362                 PG_END_TRY();
2363
2364                 /* the PGXACT flags are reset at the next end of transaction */
2365
2366                 /* be tidy */
2367 deleted:
2368                 if (tab->at_datname != NULL)
2369                         pfree(tab->at_datname);
2370                 if (tab->at_nspname != NULL)
2371                         pfree(tab->at_nspname);
2372                 if (tab->at_relname != NULL)
2373                         pfree(tab->at_relname);
2374                 pfree(tab);
2375
2376                 /*
2377                  * Remove my info from shared memory.  We could, but intentionally
2378                  * don't, clear wi_cost_limit and friends --- this is on the
2379                  * assumption that we probably have more to do with similar cost
2380                  * settings, so we don't want to give up our share of I/O for a very
2381                  * short interval and thereby thrash the global balance.
2382                  */
2383                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2384                 MyWorkerInfo->wi_tableoid = InvalidOid;
2385                 LWLockRelease(AutovacuumLock);
2386
2387                 /* restore vacuum cost GUCs for the next iteration */
2388                 VacuumCostDelay = stdVacuumCostDelay;
2389                 VacuumCostLimit = stdVacuumCostLimit;
2390         }
2391
2392         /*
2393          * We leak table_toast_map here (among other things), but since we're
2394          * going away soon, it's not a problem.
2395          */
2396
2397         /*
2398          * Update pg_database.datfrozenxid, and truncate pg_clog if possible. We
2399          * only need to do this once, not after each table.
2400          */
2401         vac_update_datfrozenxid();
2402
2403         /* Finally close out the last transaction. */
2404         CommitTransactionCommand();
2405 }
2406
2407 /*
2408  * extract_autovac_opts
2409  *
2410  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2411  * reloptions, if set; otherwise, return NULL.
2412  */
2413 static AutoVacOpts *
2414 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2415 {
2416         bytea      *relopts;
2417         AutoVacOpts *av;
2418
2419         Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2420                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2421                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2422
2423         relopts = extractRelOptions(tup, pg_class_desc, InvalidOid);
2424         if (relopts == NULL)
2425                 return NULL;
2426
2427         av = palloc(sizeof(AutoVacOpts));
2428         memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2429         pfree(relopts);
2430
2431         return av;
2432 }
2433
2434 /*
2435  * get_pgstat_tabentry_relid
2436  *
2437  * Fetch the pgstat entry of a table, either local to a database or shared.
2438  */
2439 static PgStat_StatTabEntry *
2440 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2441                                                   PgStat_StatDBEntry *dbentry)
2442 {
2443         PgStat_StatTabEntry *tabentry = NULL;
2444
2445         if (isshared)
2446         {
2447                 if (PointerIsValid(shared))
2448                         tabentry = hash_search(shared->tables, &relid,
2449                                                                    HASH_FIND, NULL);
2450         }
2451         else if (PointerIsValid(dbentry))
2452                 tabentry = hash_search(dbentry->tables, &relid,
2453                                                            HASH_FIND, NULL);
2454
2455         return tabentry;
2456 }
2457
2458 /*
2459  * table_recheck_autovac
2460  *
2461  * Recheck whether a table still needs vacuum or analyze.  Return value is a
2462  * valid autovac_table pointer if it does, NULL otherwise.
2463  *
2464  * Note that the returned autovac_table does not have the name fields set.
2465  */
2466 static autovac_table *
2467 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2468                                           TupleDesc pg_class_desc,
2469                                           int effective_multixact_freeze_max_age)
2470 {
2471         Form_pg_class classForm;
2472         HeapTuple       classTup;
2473         bool            dovacuum;
2474         bool            doanalyze;
2475         autovac_table *tab = NULL;
2476         PgStat_StatTabEntry *tabentry;
2477         PgStat_StatDBEntry *shared;
2478         PgStat_StatDBEntry *dbentry;
2479         bool            wraparound;
2480         AutoVacOpts *avopts;
2481
2482         /* use fresh stats */
2483         autovac_refresh_stats();
2484
2485         shared = pgstat_fetch_stat_dbentry(InvalidOid);
2486         dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2487
2488         /* fetch the relation's relcache entry */
2489         classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2490         if (!HeapTupleIsValid(classTup))
2491                 return NULL;
2492         classForm = (Form_pg_class) GETSTRUCT(classTup);
2493
2494         /*
2495          * Get the applicable reloptions.  If it is a TOAST table, try to get the
2496          * main table reloptions if the toast table itself doesn't have.
2497          */
2498         avopts = extract_autovac_opts(classTup, pg_class_desc);
2499         if (classForm->relkind == RELKIND_TOASTVALUE &&
2500                 avopts == NULL && table_toast_map != NULL)
2501         {
2502                 av_relation *hentry;
2503                 bool            found;
2504
2505                 hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2506                 if (found && hentry->ar_hasrelopts)
2507                         avopts = &hentry->ar_reloptions;
2508         }
2509
2510         /* fetch the pgstat table entry */
2511         tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2512                                                                                  shared, dbentry);
2513
2514         relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2515                                                           effective_multixact_freeze_max_age,
2516                                                           &dovacuum, &doanalyze, &wraparound);
2517
2518         /* ignore ANALYZE for toast tables */
2519         if (classForm->relkind == RELKIND_TOASTVALUE)
2520                 doanalyze = false;
2521
2522         /* OK, it needs something done */
2523         if (doanalyze || dovacuum)
2524         {
2525                 int                     freeze_min_age;
2526                 int                     freeze_table_age;
2527                 int                     multixact_freeze_min_age;
2528                 int                     multixact_freeze_table_age;
2529                 int                     vac_cost_limit;
2530                 int                     vac_cost_delay;
2531                 int                     log_min_duration;
2532
2533                 /*
2534                  * Calculate the vacuum cost parameters and the freeze ages.  If there
2535                  * are options set in pg_class.reloptions, use them; in the case of a
2536                  * toast table, try the main table too.  Otherwise use the GUC
2537                  * defaults, autovacuum's own first and plain vacuum second.
2538                  */
2539
2540                 /* -1 in autovac setting means use plain vacuum_cost_delay */
2541                 vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2542                         ? avopts->vacuum_cost_delay
2543                         : (autovacuum_vac_cost_delay >= 0)
2544                         ? autovacuum_vac_cost_delay
2545                         : VacuumCostDelay;
2546
2547                 /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2548                 vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2549                         ? avopts->vacuum_cost_limit
2550                         : (autovacuum_vac_cost_limit > 0)
2551                         ? autovacuum_vac_cost_limit
2552                         : VacuumCostLimit;
2553
2554                 /* -1 in autovac setting means use log_autovacuum_min_duration */
2555                 log_min_duration = (avopts && avopts->log_min_duration >= 0)
2556                         ? avopts->log_min_duration
2557                         : Log_autovacuum_min_duration;
2558
2559                 /* these do not have autovacuum-specific settings */
2560                 freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2561                         ? avopts->freeze_min_age
2562                         : default_freeze_min_age;
2563
2564                 freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2565                         ? avopts->freeze_table_age
2566                         : default_freeze_table_age;
2567
2568                 multixact_freeze_min_age = (avopts &&
2569                                                                         avopts->multixact_freeze_min_age >= 0)
2570                         ? avopts->multixact_freeze_min_age
2571                         : default_multixact_freeze_min_age;
2572
2573                 multixact_freeze_table_age = (avopts &&
2574                                                                           avopts->multixact_freeze_table_age >= 0)
2575                         ? avopts->multixact_freeze_table_age
2576                         : default_multixact_freeze_table_age;
2577
2578                 tab = palloc(sizeof(autovac_table));
2579                 tab->at_relid = relid;
2580                 tab->at_vacoptions = VACOPT_SKIPTOAST |
2581                         (dovacuum ? VACOPT_VACUUM : 0) |
2582                         (doanalyze ? VACOPT_ANALYZE : 0) |
2583                         (!wraparound ? VACOPT_NOWAIT : 0);
2584                 tab->at_params.freeze_min_age = freeze_min_age;
2585                 tab->at_params.freeze_table_age = freeze_table_age;
2586                 tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age;
2587                 tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age;
2588                 tab->at_params.is_wraparound = wraparound;
2589                 tab->at_params.log_min_duration = log_min_duration;
2590                 tab->at_vacuum_cost_limit = vac_cost_limit;
2591                 tab->at_vacuum_cost_delay = vac_cost_delay;
2592                 tab->at_relname = NULL;
2593                 tab->at_nspname = NULL;
2594                 tab->at_datname = NULL;
2595
2596                 /*
2597                  * If any of the cost delay parameters has been set individually for
2598                  * this table, disable the balancing algorithm.
2599                  */
2600                 tab->at_dobalance =
2601                         !(avopts && (avopts->vacuum_cost_limit > 0 ||
2602                                                  avopts->vacuum_cost_delay > 0));
2603         }
2604
2605         heap_freetuple(classTup);
2606
2607         return tab;
2608 }
2609
2610 /*
2611  * relation_needs_vacanalyze
2612  *
2613  * Check whether a relation needs to be vacuumed or analyzed; return each into
2614  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2615  * being forced because of Xid or multixact wraparound.
2616  *
2617  * relopts is a pointer to the AutoVacOpts options (either for itself in the
2618  * case of a plain table, or for either itself or its parent table in the case
2619  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2620  * NULL.
2621  *
2622  * A table needs to be vacuumed if the number of dead tuples exceeds a
2623  * threshold.  This threshold is calculated as
2624  *
2625  * threshold = vac_base_thresh + vac_scale_factor * reltuples
2626  *
2627  * For analyze, the analysis done is that the number of tuples inserted,
2628  * deleted and updated since the last analyze exceeds a threshold calculated
2629  * in the same fashion as above.  Note that the collector actually stores
2630  * the number of tuples (both live and dead) that there were as of the last
2631  * analyze.  This is asymmetric to the VACUUM case.
2632  *
2633  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2634  * transactions back, and if its relminmxid is more than
2635  * multixact_freeze_max_age multixacts back.
2636  *
2637  * A table whose autovacuum_enabled option is false is
2638  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2639  * Thus autovacuum can be disabled for specific tables. Also, when the stats
2640  * collector does not have data about a table, it will be skipped.
2641  *
2642  * A table whose vac_base_thresh value is < 0 takes the base value from the
2643  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2644  * value < 0 is substituted with the value of
2645  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2646  */
2647 static void
2648 relation_needs_vacanalyze(Oid relid,
2649                                                   AutoVacOpts *relopts,
2650                                                   Form_pg_class classForm,
2651                                                   PgStat_StatTabEntry *tabentry,
2652                                                   int effective_multixact_freeze_max_age,
2653  /* output params below */
2654                                                   bool *dovacuum,
2655                                                   bool *doanalyze,
2656                                                   bool *wraparound)
2657 {
2658         bool            force_vacuum;
2659         bool            av_enabled;
2660         float4          reltuples;              /* pg_class.reltuples */
2661
2662         /* constants from reloptions or GUC variables */
2663         int                     vac_base_thresh,
2664                                 anl_base_thresh;
2665         float4          vac_scale_factor,
2666                                 anl_scale_factor;
2667
2668         /* thresholds calculated from above constants */
2669         float4          vacthresh,
2670                                 anlthresh;
2671
2672         /* number of vacuum (resp. analyze) tuples at this time */
2673         float4          vactuples,
2674                                 anltuples;
2675
2676         /* freeze parameters */
2677         int                     freeze_max_age;
2678         int                     multixact_freeze_max_age;
2679         TransactionId xidForceLimit;
2680         MultiXactId multiForceLimit;
2681
2682         AssertArg(classForm != NULL);
2683         AssertArg(OidIsValid(relid));
2684
2685         /*
2686          * Determine vacuum/analyze equation parameters.  We have two possible
2687          * sources: the passed reloptions (which could be a main table or a toast
2688          * table), or the autovacuum GUC variables.
2689          */
2690
2691         /* -1 in autovac setting means use plain vacuum_cost_delay */
2692         vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
2693                 ? relopts->vacuum_scale_factor
2694                 : autovacuum_vac_scale;
2695
2696         vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
2697                 ? relopts->vacuum_threshold
2698                 : autovacuum_vac_thresh;
2699
2700         anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
2701                 ? relopts->analyze_scale_factor
2702                 : autovacuum_anl_scale;
2703
2704         anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
2705                 ? relopts->analyze_threshold
2706                 : autovacuum_anl_thresh;
2707
2708         freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
2709                 ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
2710                 : autovacuum_freeze_max_age;
2711
2712         multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0)
2713                 ? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age)
2714                 : effective_multixact_freeze_max_age;
2715
2716         av_enabled = (relopts ? relopts->enabled : true);
2717
2718         /* Force vacuum if table is at risk of wraparound */
2719         xidForceLimit = recentXid - freeze_max_age;
2720         if (xidForceLimit < FirstNormalTransactionId)
2721                 xidForceLimit -= FirstNormalTransactionId;
2722         force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
2723                                         TransactionIdPrecedes(classForm->relfrozenxid,
2724                                                                                   xidForceLimit));
2725         if (!force_vacuum)
2726         {
2727                 multiForceLimit = recentMulti - multixact_freeze_max_age;
2728                 if (multiForceLimit < FirstMultiXactId)
2729                         multiForceLimit -= FirstMultiXactId;
2730                 force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
2731                                                                                    multiForceLimit);
2732         }
2733         *wraparound = force_vacuum;
2734
2735         /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
2736         if (!av_enabled && !force_vacuum)
2737         {
2738                 *doanalyze = false;
2739                 *dovacuum = false;
2740                 return;
2741         }
2742
2743         /*
2744          * If we found the table in the stats hash, and autovacuum is currently
2745          * enabled, make a threshold-based decision whether to vacuum and/or
2746          * analyze.  If autovacuum is currently disabled, we must be here for
2747          * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything
2748          * that's not being forced.
2749          */
2750         if (PointerIsValid(tabentry) && AutoVacuumingActive())
2751         {
2752                 reltuples = classForm->reltuples;
2753                 vactuples = tabentry->n_dead_tuples;
2754                 anltuples = tabentry->changes_since_analyze;
2755
2756                 vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
2757                 anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
2758
2759                 /*
2760                  * Note that we don't need to take special consideration for stat
2761                  * reset, because if that happens, the last vacuum and analyze counts
2762                  * will be reset too.
2763                  */
2764                 elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
2765                          NameStr(classForm->relname),
2766                          vactuples, vacthresh, anltuples, anlthresh);
2767
2768                 /* Determine if this table needs vacuum or analyze. */
2769                 *dovacuum = force_vacuum || (vactuples > vacthresh);
2770                 *doanalyze = (anltuples > anlthresh);
2771         }
2772         else
2773         {
2774                 /*
2775                  * Skip a table not found in stat hash, unless we have to force vacuum
2776                  * for anti-wrap purposes.  If it's not acted upon, there's no need to
2777                  * vacuum it.
2778                  */
2779                 *dovacuum = force_vacuum;
2780                 *doanalyze = false;
2781         }
2782
2783         /* ANALYZE refuses to work with pg_statistics */
2784         if (relid == StatisticRelationId)
2785                 *doanalyze = false;
2786 }
2787
2788 /*
2789  * autovacuum_do_vac_analyze
2790  *              Vacuum and/or analyze the specified table
2791  */
2792 static void
2793 autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy)
2794 {
2795         RangeVar        rangevar;
2796
2797         /* Set up command parameters --- use local variables instead of palloc */
2798         MemSet(&rangevar, 0, sizeof(rangevar));
2799
2800         rangevar.schemaname = tab->at_nspname;
2801         rangevar.relname = tab->at_relname;
2802         rangevar.location = -1;
2803
2804         /* Let pgstat know what we're doing */
2805         autovac_report_activity(tab);
2806
2807         vacuum(tab->at_vacoptions, &rangevar, tab->at_relid, &tab->at_params, NIL,
2808                    bstrategy, true);
2809 }
2810
2811 /*
2812  * autovac_report_activity
2813  *              Report to pgstat what autovacuum is doing
2814  *
2815  * We send a SQL string corresponding to what the user would see if the
2816  * equivalent command was to be issued manually.
2817  *
2818  * Note we assume that we are going to report the next command as soon as we're
2819  * done with the current one, and exit right after the last one, so we don't
2820  * bother to report "<IDLE>" or some such.
2821  */
2822 static void
2823 autovac_report_activity(autovac_table *tab)
2824 {
2825 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
2826         char            activity[MAX_AUTOVAC_ACTIV_LEN];
2827         int                     len;
2828
2829         /* Report the command and possible options */
2830         if (tab->at_vacoptions & VACOPT_VACUUM)
2831                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2832                                  "autovacuum: VACUUM%s",
2833                                  tab->at_vacoptions & VACOPT_ANALYZE ? " ANALYZE" : "");
2834         else
2835                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2836                                  "autovacuum: ANALYZE");
2837
2838         /*
2839          * Report the qualified name of the relation.
2840          */
2841         len = strlen(activity);
2842
2843         snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
2844                          " %s.%s%s", tab->at_nspname, tab->at_relname,
2845                          tab->at_params.is_wraparound ? " (to prevent wraparound)" : "");
2846
2847         /* Set statement_timestamp() to current time for pg_stat_activity */
2848         SetCurrentStatementStartTimestamp();
2849
2850         pgstat_report_activity(STATE_RUNNING, activity);
2851 }
2852
2853 /*
2854  * AutoVacuumingActive
2855  *              Check GUC vars and report whether the autovacuum process should be
2856  *              running.
2857  */
2858 bool
2859 AutoVacuumingActive(void)
2860 {
2861         if (!autovacuum_start_daemon || !pgstat_track_counts)
2862                 return false;
2863         return true;
2864 }
2865
2866 /*
2867  * autovac_init
2868  *              This is called at postmaster initialization.
2869  *
2870  * All we do here is annoy the user if he got it wrong.
2871  */
2872 void
2873 autovac_init(void)
2874 {
2875         if (autovacuum_start_daemon && !pgstat_track_counts)
2876                 ereport(WARNING,
2877                                 (errmsg("autovacuum not started because of misconfiguration"),
2878                                  errhint("Enable the \"track_counts\" option.")));
2879 }
2880
2881 /*
2882  * IsAutoVacuum functions
2883  *              Return whether this is either a launcher autovacuum process or a worker
2884  *              process.
2885  */
2886 bool
2887 IsAutoVacuumLauncherProcess(void)
2888 {
2889         return am_autovacuum_launcher;
2890 }
2891
2892 bool
2893 IsAutoVacuumWorkerProcess(void)
2894 {
2895         return am_autovacuum_worker;
2896 }
2897
2898
2899 /*
2900  * AutoVacuumShmemSize
2901  *              Compute space needed for autovacuum-related shared memory
2902  */
2903 Size
2904 AutoVacuumShmemSize(void)
2905 {
2906         Size            size;
2907
2908         /*
2909          * Need the fixed struct and the array of WorkerInfoData.
2910          */
2911         size = sizeof(AutoVacuumShmemStruct);
2912         size = MAXALIGN(size);
2913         size = add_size(size, mul_size(autovacuum_max_workers,
2914                                                                    sizeof(WorkerInfoData)));
2915         return size;
2916 }
2917
2918 /*
2919  * AutoVacuumShmemInit
2920  *              Allocate and initialize autovacuum-related shared memory
2921  */
2922 void
2923 AutoVacuumShmemInit(void)
2924 {
2925         bool            found;
2926
2927         AutoVacuumShmem = (AutoVacuumShmemStruct *)
2928                 ShmemInitStruct("AutoVacuum Data",
2929                                                 AutoVacuumShmemSize(),
2930                                                 &found);
2931
2932         if (!IsUnderPostmaster)
2933         {
2934                 WorkerInfo      worker;
2935                 int                     i;
2936
2937                 Assert(!found);
2938
2939                 AutoVacuumShmem->av_launcherpid = 0;
2940                 dlist_init(&AutoVacuumShmem->av_freeWorkers);
2941                 dlist_init(&AutoVacuumShmem->av_runningWorkers);
2942                 AutoVacuumShmem->av_startingWorker = NULL;
2943
2944                 worker = (WorkerInfo) ((char *) AutoVacuumShmem +
2945                                                            MAXALIGN(sizeof(AutoVacuumShmemStruct)));
2946
2947                 /* initialize the WorkerInfo free list */
2948                 for (i = 0; i < autovacuum_max_workers; i++)
2949                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
2950                                                         &worker[i].wi_links);
2951         }
2952         else
2953                 Assert(found);
2954 }
2955
2956 /*
2957  * autovac_refresh_stats
2958  *              Refresh pgstats data for an autovacuum process
2959  *
2960  * Cause the next pgstats read operation to obtain fresh data, but throttle
2961  * such refreshing in the autovacuum launcher.  This is mostly to avoid
2962  * rereading the pgstats files too many times in quick succession when there
2963  * are many databases.
2964  *
2965  * Note: we avoid throttling in the autovac worker, as it would be
2966  * counterproductive in the recheck logic.
2967  */
2968 static void
2969 autovac_refresh_stats(void)
2970 {
2971         if (IsAutoVacuumLauncherProcess())
2972         {
2973                 static TimestampTz last_read = 0;
2974                 TimestampTz current_time;
2975
2976                 current_time = GetCurrentTimestamp();
2977
2978                 if (!TimestampDifferenceExceeds(last_read, current_time,
2979                                                                                 STATS_READ_DELAY))
2980                         return;
2981
2982                 last_read = current_time;
2983         }
2984
2985         pgstat_clear_snapshot();
2986 }