1 /*-------------------------------------------------------------------------
2  *
3  * bgwriter.c
4  *
5  * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
6  * to keep regular backends from having to write out dirty shared buffers
7  * (which they would only do when needing to free a shared buffer to read in
8  * another page).  In the best scenario all writes from shared buffers will
9  * be issued by the background writer process.  However, regular backends are
10  * still empowered to issue writes if the bgwriter fails to maintain enough
11  * clean shared buffers.
12  *
13  * The bgwriter is also charged with handling all checkpoints.  It will
14  * automatically dispatch a checkpoint after a certain amount of time has
15  * elapsed since the last one, and it can be signaled to perform requested
16  * checkpoints as well.  (The GUC parameter that mandates a checkpoint every
17  * so many WAL segments is implemented by having backends signal the bgwriter
18  * when they fill WAL segments; the bgwriter itself doesn't watch for the
19  * condition.)
20  *
21  * The bgwriter is started by the postmaster as soon as the startup subprocess
22  * finishes.  It remains alive until the postmaster commands it to terminate.
23  * Normal termination is by SIGUSR2, which instructs the bgwriter to execute
24  * a shutdown checkpoint and then exit(0).      (All backends must be stopped
25  * before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT; like any
26  * backend, the bgwriter will simply abort and exit on SIGQUIT.
27  *
28  * If the bgwriter exits unexpectedly, the postmaster treats that the same
29  * as a backend crash: shared memory may be corrupted, so remaining backends
30  * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
31  * shared memory isn't corrupted, we have lost information about which
32  * files need to be fsync'd for the next checkpoint, and so a system
33  * restart needs to be forced.)
34  *
35  *
36  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
37  *
38  *
39  * IDENTIFICATION
40  *        $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.57 2009/03/26 22:26:06 petere Exp $
41  *
42  *-------------------------------------------------------------------------
43  */
44 #include "postgres.h"
45
46 #include <signal.h>
47 #include <sys/time.h>
48 #include <time.h>
49 #include <unistd.h>
50
51 #include "access/xlog_internal.h"
52 #include "catalog/pg_control.h"
53 #include "libpq/pqsignal.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "postmaster/bgwriter.h"
57 #include "storage/bufmgr.h"
58 #include "storage/fd.h"
59 #include "storage/ipc.h"
60 #include "storage/lwlock.h"
61 #include "storage/pmsignal.h"
62 #include "storage/shmem.h"
63 #include "storage/smgr.h"
64 #include "storage/spin.h"
65 #include "tcop/tcopprot.h"
66 #include "utils/guc.h"
67 #include "utils/memutils.h"
68 #include "utils/resowner.h"
69
70
71 /*----------
72  * Shared memory area for communication between bgwriter and backends
73  *
74  * The ckpt counters allow backends to watch for completion of a checkpoint
75  * request they send.  Here's how it works:
76  *      * At start of a checkpoint, bgwriter reads (and clears) the request flags
77  *        and increments ckpt_started, while holding ckpt_lck.
78  *      * On completion of a checkpoint, bgwriter sets ckpt_done to
79  *        equal ckpt_started.
80  *      * On failure of a checkpoint, bgwriter increments ckpt_failed
81  *        and sets ckpt_done to equal ckpt_started.
82  *
83  * The algorithm for backends is:
84  *      1. Record current values of ckpt_failed and ckpt_started, and
85  *         set request flags, while holding ckpt_lck.
86  *      2. Send signal to request checkpoint.
87  *      3. Sleep until ckpt_started changes.  Now you know a checkpoint has
88  *         begun since you started this algorithm (although *not* that it was
89  *         specifically initiated by your signal), and that it is using your flags.
90  *      4. Record new value of ckpt_started.
91  *      5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
92  *         arithmetic here in case counters wrap around.)  Now you know a
93  *         checkpoint has started and completed, but not whether it was
94  *         successful.
95  *      6. If ckpt_failed is different from the originally saved value,
96  *         assume request failed; otherwise it was definitely successful.
97  *
98  * ckpt_flags holds the OR of the checkpoint request flags sent by all
99  * requesting backends since the last checkpoint start.  The flags are
100  * chosen so that OR'ing is the correct way to combine multiple requests.
101  *
102  * num_backend_writes is used to count the number of buffer writes performed
103  * by non-bgwriter processes.  This counter should be wide enough that it
104  * can't overflow during a single bgwriter cycle.
105  *
106  * The requests array holds fsync requests sent by backends and not yet
107  * absorbed by the bgwriter.
108  *
109  * Unlike the checkpoint fields, num_backend_writes and the requests
110  * fields are protected by BgWriterCommLock.
111  *----------
112  */
113 typedef struct
114 {
115         RelFileNode rnode;
116         ForkNumber forknum;
117         BlockNumber segno;                      /* see md.c for special values */
118         /* might add a real request-type field later; not needed yet */
119 } BgWriterRequest;
120
121 typedef struct
122 {
123         pid_t           bgwriter_pid;   /* PID of bgwriter (0 if not started) */
124
125         slock_t         ckpt_lck;               /* protects all the ckpt_* fields */
126
127         int                     ckpt_started;   /* advances when checkpoint starts */
128         int                     ckpt_done;              /* advances when checkpoint done */
129         int                     ckpt_failed;    /* advances when checkpoint fails */
130
131         int                     ckpt_flags;             /* checkpoint flags, as defined in xlog.h */
132
133         uint32          num_backend_writes;             /* counts non-bgwriter buffer writes */
134
135         int                     num_requests;   /* current # of requests */
136         int                     max_requests;   /* allocated array size */
137         BgWriterRequest requests[1];    /* VARIABLE LENGTH ARRAY */
138 } BgWriterShmemStruct;
139
140 static BgWriterShmemStruct *BgWriterShmem;
141
142 /* interval for calling AbsorbFsyncRequests in CheckpointWriteDelay */
143 #define WRITES_PER_ABSORB               1000
144
145 /*
146  * GUC parameters
147  */
148 int                     BgWriterDelay = 200;
149 int                     CheckPointTimeout = 300;
150 int                     CheckPointWarning = 30;
151 double          CheckPointCompletionTarget = 0.5;
152
153 /*
154  * Flags set by interrupt handlers for later service in the main loop.
155  */
156 static volatile sig_atomic_t got_SIGHUP = false;
157 static volatile sig_atomic_t checkpoint_requested = false;
158 static volatile sig_atomic_t shutdown_requested = false;
159
160 /*
161  * Private state
162  */
163 static bool am_bg_writer = false;
164
165 static bool ckpt_active = false;
166
167 /* these values are valid when ckpt_active is true: */
168 static pg_time_t ckpt_start_time;
169 static XLogRecPtr ckpt_start_recptr;
170 static double ckpt_cached_elapsed;
171
172 static pg_time_t last_checkpoint_time;
173 static pg_time_t last_xlog_switch_time;
174
175 /* Prototypes for private functions */
176
177 static void CheckArchiveTimeout(void);
178 static void BgWriterNap(void);
179 static bool IsCheckpointOnSchedule(double progress);
180 static bool ImmediateCheckpointRequested(void);
181
182 /* Signal handlers */
183
184 static void bg_quickdie(SIGNAL_ARGS);
185 static void BgSigHupHandler(SIGNAL_ARGS);
186 static void ReqCheckpointHandler(SIGNAL_ARGS);
187 static void ReqShutdownHandler(SIGNAL_ARGS);
188
189
190 /*
191  * Main entry point for bgwriter process
192  *
193  * This is invoked from BootstrapMain, which has already created the basic
194  * execution environment, but not enabled signals yet.
195  */
196 void
197 BackgroundWriterMain(void)
198 {
199         sigjmp_buf      local_sigjmp_buf;
200         MemoryContext bgwriter_context;
201
202         BgWriterShmem->bgwriter_pid = MyProcPid;
203         am_bg_writer = true;
204
205         /*
206          * If possible, make this process a group leader, so that the postmaster
207          * can signal any child processes too.  (bgwriter probably never has any
208          * child processes, but for consistency we make all postmaster child
209          * processes do this.)
210          */
211 #ifdef HAVE_SETSID
212         if (setsid() < 0)
213                 elog(FATAL, "setsid() failed: %m");
214 #endif
215
216         /*
217          * Properly accept or ignore signals the postmaster might send us
218          *
219          * Note: we deliberately ignore SIGTERM, because during a standard Unix
220          * system shutdown cycle, init will SIGTERM all processes at once.      We
221          * want to wait for the backends to exit, whereupon the postmaster will
222          * tell us it's okay to shut down (via SIGUSR2).
223          *
224          * SIGUSR1 is presently unused; keep it spare in case someday we want this
225          * process to participate in sinval messaging.
226          */
227         pqsignal(SIGHUP, BgSigHupHandler);      /* set flag to read config file */
228         pqsignal(SIGINT, ReqCheckpointHandler);         /* request checkpoint */
229         pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
230         pqsignal(SIGQUIT, bg_quickdie);         /* hard crash time */
231         pqsignal(SIGALRM, SIG_IGN);
232         pqsignal(SIGPIPE, SIG_IGN);
233         pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */
234         pqsignal(SIGUSR2, ReqShutdownHandler);          /* request shutdown */
235
236         /*
237          * Reset some signals that are accepted by postmaster but not here
238          */
239         pqsignal(SIGCHLD, SIG_DFL);
240         pqsignal(SIGTTIN, SIG_DFL);
241         pqsignal(SIGTTOU, SIG_DFL);
242         pqsignal(SIGCONT, SIG_DFL);
243         pqsignal(SIGWINCH, SIG_DFL);
244
245         /* We allow SIGQUIT (quickdie) at all times */
246 #ifdef HAVE_SIGPROCMASK
247         sigdelset(&BlockSig, SIGQUIT);
248 #else
249         BlockSig &= ~(sigmask(SIGQUIT));
250 #endif
251
252         /*
253          * Initialize so that the first time-driven event happens at the correct time.
254          */
255         last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
256
257         /*
258          * Create a resource owner to keep track of our resources (currently only
259          * buffer pins).
260          */
261         CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");
262
263         /*
264          * Create a memory context that we will do all our work in.  We do this so
265          * that we can reset the context during error recovery and thereby avoid
266          * possible memory leaks.  Formerly this code just ran in
267          * TopMemoryContext, but resetting that would be a really bad idea.
268          */
269         bgwriter_context = AllocSetContextCreate(TopMemoryContext,
270                                                                                          "Background Writer",
271                                                                                          ALLOCSET_DEFAULT_MINSIZE,
272                                                                                          ALLOCSET_DEFAULT_INITSIZE,
273                                                                                          ALLOCSET_DEFAULT_MAXSIZE);
274         MemoryContextSwitchTo(bgwriter_context);
275
276         /*
277          * If an exception is encountered, processing resumes here.
278          *
279          * See notes in postgres.c about the design of this coding.
280          */
281         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
282         {
283                 /* Since not using PG_TRY, must reset error stack by hand */
284                 error_context_stack = NULL;
285
286                 /* Prevent interrupts while cleaning up */
287                 HOLD_INTERRUPTS();
288
289                 /* Report the error to the server log */
290                 EmitErrorReport();
291
292                 /*
293                  * These operations are really just a minimal subset of
294                  * AbortTransaction().  We don't have very many resources to worry
295                  * about in bgwriter, but we do have LWLocks, buffers, and temp files.
296                  */
297                 LWLockReleaseAll();
298                 AbortBufferIO();
299                 UnlockBuffers();
300                 /* buffer pins are released here: */
301                 ResourceOwnerRelease(CurrentResourceOwner,
302                                                          RESOURCE_RELEASE_BEFORE_LOCKS,
303                                                          false, true);
304                 /* we needn't bother with the other ResourceOwnerRelease phases */
305                 AtEOXact_Buffers(false);
306                 AtEOXact_Files();
307                 AtEOXact_HashTables(false);
308
309                 /* Warn any waiting backends that the checkpoint failed. */
310                 if (ckpt_active)
311                 {
312                         /* use volatile pointer to prevent code rearrangement */
313                         volatile BgWriterShmemStruct *bgs = BgWriterShmem;
314
315                         SpinLockAcquire(&bgs->ckpt_lck);
316                         bgs->ckpt_failed++;
317                         bgs->ckpt_done = bgs->ckpt_started;
318                         SpinLockRelease(&bgs->ckpt_lck);
319
320                         ckpt_active = false;
321                 }
322
323                 /*
324                  * Now return to normal top-level context and clear ErrorContext for
325                  * next time.
326                  */
327                 MemoryContextSwitchTo(bgwriter_context);
328                 FlushErrorState();
329
330                 /* Flush any leaked data in the top-level context */
331                 MemoryContextResetAndDeleteChildren(bgwriter_context);
332
333                 /* Now we can allow interrupts again */
334                 RESUME_INTERRUPTS();
335
336                 /*
337                  * Sleep at least 1 second after any error.  A write error is likely
338                  * to be repeated, and we don't want to be filling the error logs as
339                  * fast as we can.
340                  */
341                 pg_usleep(1000000L);
342
343                 /*
344                  * Close all open files after any error.  This is helpful on Windows,
345                  * where holding deleted files open causes various strange errors.
346                  * It's not clear we need it elsewhere, but shouldn't hurt.
347                  */
348                 smgrcloseall();
349         }
350
351         /* We can now handle ereport(ERROR) */
352         PG_exception_stack = &local_sigjmp_buf;
353
354         /*
355          * Unblock signals (they were blocked when the postmaster forked us)
356          */
357         PG_SETMASK(&UnBlockSig);
358
359         /*
360          * Loop forever
361          */
362         for (;;)
363         {
364                 bool            do_checkpoint = false;
365                 int                     flags = 0;
366                 pg_time_t       now;
367                 int                     elapsed_secs;
368
369                 /*
370                  * Emergency bailout if postmaster has died.  This is to avoid the
371                  * necessity for manual cleanup of all postmaster children.
372                  */
373                 if (!PostmasterIsAlive(true))
374                         exit(1);
375
376                 /*
377                  * Process any requests or signals received recently.
378                  */
379                 AbsorbFsyncRequests();
380
381                 if (got_SIGHUP)
382                 {
383                         got_SIGHUP = false;
384                         ProcessConfigFile(PGC_SIGHUP);
385                 }
386                 if (checkpoint_requested)
387                 {
388                         checkpoint_requested = false;
389                         do_checkpoint = true;
390                         BgWriterStats.m_requested_checkpoints++;
391                 }
392                 if (shutdown_requested)
393                 {
394                         /*
395                          * From here on, elog(ERROR) should end with exit(1), not send
396                          * control back to the sigsetjmp block above
397                          */
398                         ExitOnAnyError = true;
399                         /* Close down the database */
400                         ShutdownXLOG(0, 0);
401                         /* Normal exit from the bgwriter is here */
402                         proc_exit(0);           /* done */
403                 }
404
405                 /*
406                  * Force a checkpoint if too much time has elapsed since the last one.
407                  * Note that we count a timed checkpoint in stats only when this
408                  * occurs without an external request, but we set the CAUSE_TIME flag
409                  * bit even if there is also an external request.
410                  */
411                 now = (pg_time_t) time(NULL);
412                 elapsed_secs = now - last_checkpoint_time;
413                 if (elapsed_secs >= CheckPointTimeout)
414                 {
415                         if (!do_checkpoint)
416                                 BgWriterStats.m_timed_checkpoints++;
417                         do_checkpoint = true;
418                         flags |= CHECKPOINT_CAUSE_TIME;
419                 }
420
421                 /*
422                  * Do a checkpoint if requested, otherwise do one cycle of
423                  * dirty-buffer writing.
424                  */
425                 if (do_checkpoint)
426                 {
427                         bool    ckpt_performed = false;
428                         bool    do_restartpoint;
429
430                         /* use volatile pointer to prevent code rearrangement */
431                         volatile BgWriterShmemStruct *bgs = BgWriterShmem;
432
433                         /*
434                          * Check if we should perform a checkpoint or a restartpoint.
435                          * As a side-effect, RecoveryInProgress() initializes
436                          * TimeLineID if it's not set yet.
437                          */
438                         do_restartpoint = RecoveryInProgress();
439
440                         /*
441                          * Atomically fetch the request flags to figure out what kind of a
442                          * checkpoint we should perform, and increase the started-counter
443                          * to acknowledge that we've started a new checkpoint.
444                          */
445                         SpinLockAcquire(&bgs->ckpt_lck);
446                         flags |= bgs->ckpt_flags;
447                         bgs->ckpt_flags = 0;
448                         bgs->ckpt_started++;
449                         SpinLockRelease(&bgs->ckpt_lck);
450
451                         /*
452                          * We will warn if (a) too soon since last checkpoint (whatever
453                          * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
454                          * since the last checkpoint start.  Note in particular that this
455                          * implementation will not generate warnings caused by
456                          * CheckPointTimeout < CheckPointWarning.
457                          */
458                         if (!do_restartpoint &&
459                                 (flags & CHECKPOINT_CAUSE_XLOG) &&
460                                 elapsed_secs < CheckPointWarning)
461                                 ereport(LOG,
462                                                 (errmsg(ngettext("checkpoints are occurring too frequently (%d second apart)",
463                                                                                  "checkpoints are occurring too frequently (%d seconds apart)",
464                                                                                  elapsed_secs),
465                                                                 elapsed_secs),
466                                                  errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
467
468                         /*
469                          * Initialize bgwriter-private variables used during checkpoint.
470                          */
471                         ckpt_active = true;
472                         if (!do_restartpoint)
473                                 ckpt_start_recptr = GetInsertRecPtr();
474                         ckpt_start_time = now;
475                         ckpt_cached_elapsed = 0;
476
477                         /*
478                          * Do the checkpoint.
479                          */
480                         if (!do_restartpoint)
481                         {
482                                 CreateCheckPoint(flags);
483                                 ckpt_performed = true;
484                         }
485                         else
486                                 ckpt_performed = CreateRestartPoint(flags);
487
488                         /*
489                          * After any checkpoint, close all smgr files.  This is so we
490                          * won't hang onto smgr references to deleted files indefinitely.
491                          */
492                         smgrcloseall();
493
494                         /*
495                          * Indicate checkpoint completion to any waiting backends.
496                          */
497                         SpinLockAcquire(&bgs->ckpt_lck);
498                         bgs->ckpt_done = bgs->ckpt_started;
499                         SpinLockRelease(&bgs->ckpt_lck);
500
501                         if (ckpt_performed)
502                         {
503                                 /*
504                                  * Note we record the checkpoint start time not end time as
505                                  * last_checkpoint_time.  This is so that time-driven
506                                  * checkpoints happen at a predictable spacing.
507                                  */
508                                 last_checkpoint_time = now;
509                         }
510                         else
511                         {
512                                 /*
513                                  * We were not able to perform the restartpoint (checkpoints
514                                  * throw an ERROR in case of error).  Most likely because we
515                                  * have not received any new checkpoint WAL records since the
516                                  * last restartpoint. Try again in 15 s.
517                                  */
518                                 last_checkpoint_time = now - CheckPointTimeout + 15;
519                         }
520
521                         ckpt_active = false;
522                 }
523                 else
524                         BgBufferSync();
525
526                 /* Check for archive_timeout and switch xlog files if necessary. */
527                 CheckArchiveTimeout();
528
529                 /* Nap for the configured time. */
530                 BgWriterNap();
531         }
532 }
533
534 /*
535  * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
536  *              if needed
537  */
538 static void
539 CheckArchiveTimeout(void)
540 {
541         pg_time_t       now;
542         pg_time_t       last_time;
543
544         if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
545                 return;
546
547         now = (pg_time_t) time(NULL);
548
549         /* First we do a quick check using possibly-stale local state. */
550         if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
551                 return;
552
553         /*
554          * Update local state ... note that last_xlog_switch_time is the last time
555          * a switch was performed *or requested*.
556          */
557         last_time = GetLastSegSwitchTime();
558
559         last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
560
561         /* Now we can do the real check */
562         if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
563         {
564                 XLogRecPtr      switchpoint;
565
566                 /* OK, it's time to switch */
567                 switchpoint = RequestXLogSwitch();
568
569                 /*
570                  * If the returned pointer points exactly to a segment boundary,
571                  * assume nothing happened.
572                  */
573                 if ((switchpoint.xrecoff % XLogSegSize) != 0)
574                         ereport(DEBUG1,
575                                 (errmsg("transaction log switch forced (archive_timeout=%d)",
576                                                 XLogArchiveTimeout)));
577
578                 /*
579                  * Update state in any case, so we don't retry constantly when the
580                  * system is idle.
581                  */
582                 last_xlog_switch_time = now;
583         }
584 }
585
586 /*
587  * BgWriterNap -- Nap for the configured time or until a signal is received.
588  */
589 static void
590 BgWriterNap(void)
591 {
592         long            udelay;
593
594         /*
595          * Send off activity statistics to the stats collector
596          */
597         pgstat_send_bgwriter();
598
599         /*
600          * Nap for the configured time, or sleep for 10 seconds if there is no
601          * bgwriter activity configured.
602          *
603          * On some platforms, signals won't interrupt the sleep.  To ensure we
604          * respond reasonably promptly when someone signals us, break down the
605          * sleep into 1-second increments, and check for interrupts after each
606          * nap.
607          *
608          * We absorb pending requests after each short sleep.
609          */
610         if (bgwriter_lru_maxpages > 0 || ckpt_active)
611                 udelay = BgWriterDelay * 1000L;
612         else if (XLogArchiveTimeout > 0)
613                 udelay = 1000000L;              /* One second */
614         else
615                 udelay = 10000000L;             /* Ten seconds */
616
617         while (udelay > 999999L)
618         {
619                 if (got_SIGHUP || shutdown_requested ||
620                 (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
621                         break;
622                 pg_usleep(1000000L);
623                 AbsorbFsyncRequests();
624                 udelay -= 1000000L;
625         }
626
627         if (!(got_SIGHUP || shutdown_requested ||
628           (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)))
629                 pg_usleep(udelay);
630 }
631
632 /*
633  * Returns true if an immediate checkpoint request is pending.  (Note that
634  * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
635  * there is one pending behind it.)
636  */
637 static bool
638 ImmediateCheckpointRequested(void)
639 {
640         if (checkpoint_requested)
641         {
642                 volatile BgWriterShmemStruct *bgs = BgWriterShmem;
643
644                 /*
645                  * We don't need to acquire the ckpt_lck in this case because we're
646                  * only looking at a single flag bit.
647                  */
648                 if (bgs->ckpt_flags & CHECKPOINT_IMMEDIATE)
649                         return true;
650         }
651         return false;
652 }
653
654 /*
655  * CheckpointWriteDelay -- yield control to bgwriter during a checkpoint
656  *
657  * This function is called after each page write performed by BufferSync().
658  * It is responsible for keeping the bgwriter's normal activities in
659  * progress during a long checkpoint, and for throttling BufferSync()'s
660  * write rate to hit checkpoint_completion_target.
661  *
662  * The checkpoint request flags should be passed in; currently the only one
663  * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
664  *
665  * 'progress' is an estimate of how much of the work has been done, as a
666  * fraction between 0.0 meaning none, and 1.0 meaning all done.
667  */
668 void
669 CheckpointWriteDelay(int flags, double progress)
670 {
671         static int      absorb_counter = WRITES_PER_ABSORB;
672
673         /* Do nothing if checkpoint is being executed by non-bgwriter process */
674         if (!am_bg_writer)
675                 return;
676
677         /*
678          * Perform the usual bgwriter duties and take a nap, unless we're behind
679          * schedule, in which case we just try to catch up as quickly as possible.
680          */
681         if (!(flags & CHECKPOINT_IMMEDIATE) &&
682                 !shutdown_requested &&
683                 !ImmediateCheckpointRequested() &&
684                 IsCheckpointOnSchedule(progress))
685         {
686                 if (got_SIGHUP)
687                 {
688                         got_SIGHUP = false;
689                         ProcessConfigFile(PGC_SIGHUP);
690                 }
691
692                 AbsorbFsyncRequests();
693                 absorb_counter = WRITES_PER_ABSORB;
694
695                 BgBufferSync();
696                 CheckArchiveTimeout();
697                 BgWriterNap();
698         }
699         else if (--absorb_counter <= 0)
700         {
701                 /*
702                  * Absorb pending fsync requests after each WRITES_PER_ABSORB write
703                  * operations even when we don't sleep, to prevent overflow of the
704                  * fsync request queue.
705                  */
706                 AbsorbFsyncRequests();
707                 absorb_counter = WRITES_PER_ABSORB;
708         }
709 }
710
711 /*
712  * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
713  *               in time?
714  *
715  * Compares the current progress against the time/segments elapsed since the
716  * last checkpoint, and returns true if the progress made so far is greater
717  * than the elapsed time/segments.
718  */
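/*
 * For instance, with the default checkpoint_timeout = 300s and
 * checkpoint_completion_target = 0.5, a checkpoint that has written 40% of
 * its buffers (progress = 0.4, scaled to 0.2 below) counts as on schedule
 * only while no more than 60 seconds have elapsed and no more than 20% of
 * checkpoint_segments worth of WAL has been inserted since it started.
 */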
719 static bool
720 IsCheckpointOnSchedule(double progress)
721 {
722         XLogRecPtr      recptr;
723         struct timeval now;
724         double          elapsed_xlogs,
725                                 elapsed_time;
726
727         Assert(ckpt_active);
728
729         /* Scale progress according to checkpoint_completion_target. */
730         progress *= CheckPointCompletionTarget;
731
732         /*
733          * Check against the cached value first. Only do the more expensive
734          * calculations once we reach the target previously calculated. Since
735  * neither time nor the WAL insert pointer moves backwards, a freshly
736          * calculated value can only be greater than or equal to the cached value.
737          */
738         if (progress < ckpt_cached_elapsed)
739                 return false;
740
741         /*
742          * Check progress against WAL segments written and checkpoint_segments.
743          *
744          * We compare the current WAL insert location against the location
745          * computed before calling CreateCheckPoint. The code in XLogInsert that
746          * actually triggers a checkpoint when checkpoint_segments is exceeded
747          * compares against RedoRecptr, so this is not completely accurate.
748  * However, it's good enough for our purposes; we're only calculating an
749          * estimate anyway.
750          */
751         if (!RecoveryInProgress())
752         {
753                 recptr = GetInsertRecPtr();
754                 elapsed_xlogs =
755                         (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
756                          ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
757                         CheckPointSegments;
758
759                 if (progress < elapsed_xlogs)
760                 {
761                         ckpt_cached_elapsed = elapsed_xlogs;
762                         return false;
763                 }
764         }
765
766         /*
767          * Check progress against time elapsed and checkpoint_timeout.
768          */
769         gettimeofday(&now, NULL);
770         elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
771                                         now.tv_usec / 1000000.0) / CheckPointTimeout;
772
773         if (progress < elapsed_time)
774         {
775                 ckpt_cached_elapsed = elapsed_time;
776                 return false;
777         }
778
779         /* It looks like we're on schedule. */
780         return true;
781 }
782
783
784 /* --------------------------------
785  *              signal handler routines
786  * --------------------------------
787  */
788
789 /*
790  * bg_quickdie() occurs when signalled SIGQUIT by the postmaster.
791  *
792  * Some backend has bought the farm,
793  * so we need to stop what we're doing and exit.
794  */
795 static void
796 bg_quickdie(SIGNAL_ARGS)
797 {
798         PG_SETMASK(&BlockSig);
799
800         /*
801          * DO NOT proc_exit() -- we're here because shared memory may be
802          * corrupted, so we don't want to try to clean up our transaction. Just
803          * nail the windows shut and get out of town.
804          *
805          * Note we do exit(2) not exit(0).      This is to force the postmaster into a
806          * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
807          * backend.  This is necessary precisely because we don't clean up our
808          * shared memory state.
809          */
810         exit(2);
811 }
812
813 /* SIGHUP: set flag to re-read config file at next convenient time */
814 static void
815 BgSigHupHandler(SIGNAL_ARGS)
816 {
817         got_SIGHUP = true;
818 }
819
820 /* SIGINT: set flag to run a normal checkpoint right away */
821 static void
822 ReqCheckpointHandler(SIGNAL_ARGS)
823 {
824         checkpoint_requested = true;
825 }
826
827 /* SIGUSR2: set flag to run a shutdown checkpoint and exit */
828 static void
829 ReqShutdownHandler(SIGNAL_ARGS)
830 {
831         shutdown_requested = true;
832 }
833
834
835 /* --------------------------------
836  *              communication with backends
837  * --------------------------------
838  */
839
840 /*
841  * BgWriterShmemSize
842  *              Compute space needed for bgwriter-related shared memory
843  */
844 Size
845 BgWriterShmemSize(void)
846 {
847         Size            size;
848
849         /*
850          * Currently, the size of the requests[] array is arbitrarily set equal to
851          * NBuffers.  This may prove too large or small ...
852          */
853         size = offsetof(BgWriterShmemStruct, requests);
854         size = add_size(size, mul_size(NBuffers, sizeof(BgWriterRequest)));
855
856         return size;
857 }
858
859 /*
860  * BgWriterShmemInit
861  *              Allocate and initialize bgwriter-related shared memory
862  */
863 void
864 BgWriterShmemInit(void)
865 {
866         bool            found;
867
868         BgWriterShmem = (BgWriterShmemStruct *)
869                 ShmemInitStruct("Background Writer Data",
870                                                 BgWriterShmemSize(),
871                                                 &found);
872         if (BgWriterShmem == NULL)
873                 ereport(FATAL,
874                                 (errcode(ERRCODE_OUT_OF_MEMORY),
875                                  errmsg("not enough shared memory for background writer")));
876         if (found)
877                 return;                                 /* already initialized */
878
879         MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
880         SpinLockInit(&BgWriterShmem->ckpt_lck);
881         BgWriterShmem->max_requests = NBuffers;
882 }
883
884 /*
885  * RequestCheckpoint
886  *              Called in backend processes to request a checkpoint
887  *
888  * flags is a bitwise OR of the following:
889  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
890  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
891  *              ignoring checkpoint_completion_target parameter.
892  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
893  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
894  *      CHECKPOINT_WAIT: wait for completion before returning (otherwise,
895  *              just signal bgwriter to do it, and return).
896  *      CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
897  *              (This affects logging, and in particular enables CheckPointWarning.)
898  */
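/*
 * For example, a caller that needs the checkpoint finished as soon as
 * possible and must wait for its completion might issue something like
 *
 *              RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
 *
 * whereas a backend that merely notices WAL segments filling up would
 * typically pass CHECKPOINT_CAUSE_XLOG and not wait.
 */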
899 void
900 RequestCheckpoint(int flags)
901 {
902         /* use volatile pointer to prevent code rearrangement */
903         volatile BgWriterShmemStruct *bgs = BgWriterShmem;
904         int                     ntries;
905         int                     old_failed,
906                                 old_started;
907
908         /*
909          * If in a standalone backend, just do it ourselves.
910          */
911         if (!IsPostmasterEnvironment)
912         {
913                 /*
914                  * There's no point in doing slow checkpoints in a standalone backend,
915                  * because there are no other backends the checkpoint could disrupt.
916                  */
917                 CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
918
919                 /*
920                  * After any checkpoint, close all smgr files.  This is so we won't
921                  * hang onto smgr references to deleted files indefinitely.
922                  */
923                 smgrcloseall();
924
925                 return;
926         }
927
928         /*
929          * Atomically set the request flags, and take a snapshot of the counters.
930          * When we see ckpt_started > old_started, we know the flags we set here
931          * have been seen by bgwriter.
932          *
933          * Note that we OR the flags with any existing flags, to avoid overriding
934          * a "stronger" request by another backend.  The flag senses must be
935          * chosen to make this work!
936          */
937         SpinLockAcquire(&bgs->ckpt_lck);
938
939         old_failed = bgs->ckpt_failed;
940         old_started = bgs->ckpt_started;
941         bgs->ckpt_flags |= flags;
942
943         SpinLockRelease(&bgs->ckpt_lck);
944
945         /*
946          * Send signal to request checkpoint.  It's possible that the bgwriter
947          * hasn't started yet, or is in the process of restarting, so we will retry
948          * a few times if needed.  Also, if not told to wait for the checkpoint
949          * to occur, we consider failure to send the signal to be nonfatal and
950          * merely LOG it.
951          */
952         for (ntries = 0; ; ntries++)
953         {
954                 if (BgWriterShmem->bgwriter_pid == 0)
955                 {
956                         if (ntries >= 20)               /* max wait 2.0 sec */
957                         {
958                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
959                                          "could not request checkpoint because bgwriter not running");
960                                 break;
961                         }
962                 }
963                 else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
964                 {
965                         if (ntries >= 20)               /* max wait 2.0 sec */
966                         {
967                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
968                                          "could not signal for checkpoint: %m");
969                                 break;
970                         }
971                 }
972                 else
973                         break;                          /* signal sent successfully */
974
975                 CHECK_FOR_INTERRUPTS();
976                 pg_usleep(100000L);             /* wait 0.1 sec, then retry */
977         }
978
979         /*
980          * If requested, wait for completion.  We detect completion according to
981          * the algorithm given above.
982          */
983         if (flags & CHECKPOINT_WAIT)
984         {
985                 int                     new_started,
986                                         new_failed;
987
988                 /* Wait for a new checkpoint to start. */
989                 for (;;)
990                 {
991                         SpinLockAcquire(&bgs->ckpt_lck);
992                         new_started = bgs->ckpt_started;
993                         SpinLockRelease(&bgs->ckpt_lck);
994
995                         if (new_started != old_started)
996                                 break;
997
998                         CHECK_FOR_INTERRUPTS();
999                         pg_usleep(100000L);
1000                 }
1001
1002                 /*
1003                  * We are waiting for ckpt_done >= new_started, in a modulo sense.
1004                  */
1005                 for (;;)
1006                 {
1007                         int                     new_done;
1008
1009                         SpinLockAcquire(&bgs->ckpt_lck);
1010                         new_done = bgs->ckpt_done;
1011                         new_failed = bgs->ckpt_failed;
1012                         SpinLockRelease(&bgs->ckpt_lck);
1013
1014                         if (new_done - new_started >= 0)
1015                                 break;
1016
1017                         CHECK_FOR_INTERRUPTS();
1018                         pg_usleep(100000L);
1019                 }
1020
1021                 if (new_failed != old_failed)
1022                         ereport(ERROR,
1023                                         (errmsg("checkpoint request failed"),
1024                                          errhint("Consult recent messages in the server log for details.")));
1025         }
1026 }
1027
1028 /*
1029  * ForwardFsyncRequest
1030  *              Forward a file-fsync request from a backend to the bgwriter
1031  *
1032  * Whenever a backend is compelled to write directly to a relation
1033  * (which should be seldom, if the bgwriter is getting its job done),
1034  * the backend calls this routine to pass over knowledge that the relation
1035  * is dirty and must be fsync'd before next checkpoint.  We also use this
1036  * opportunity to count such writes for statistical purposes.
1037  *
1038  * segno specifies which segment (not block!) of the relation needs to be
1039  * fsync'd.  (Since the valid range is much less than BlockNumber, we can
1040  * use high values for special flags; that's all internal to md.c, which
1041  * see for details.)
1042  *
1043  * If we are unable to pass over the request (at present, this can happen
1044  * if the shared memory queue is full), we return false.  That forces
1045  * the backend to do its own fsync.  We hope that will be even more seldom.
1046  *
1047  * Note: we presently make no attempt to eliminate duplicate requests
1048  * in the requests[] queue.  The bgwriter will have to eliminate dups
1049  * internally anyway, so we may as well avoid holding the lock longer
1050  * than we have to here.
1051  */
1052 bool
1053 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
1054 {
1055         BgWriterRequest *request;
1056
1057         if (!IsUnderPostmaster)
1058                 return false;                   /* probably shouldn't even get here */
1059
1060         if (am_bg_writer)
1061                 elog(ERROR, "ForwardFsyncRequest must not be called in bgwriter");
1062
1063         LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
1064
1065         /* we count non-bgwriter writes even when the request queue overflows */
1066         BgWriterShmem->num_backend_writes++;
1067
1068         if (BgWriterShmem->bgwriter_pid == 0 ||
1069                 BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
1070         {
1071                 LWLockRelease(BgWriterCommLock);
1072                 return false;
1073         }
1074         request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];
1075         request->rnode = rnode;
1076         request->forknum = forknum;
1077         request->segno = segno;
1078         LWLockRelease(BgWriterCommLock);
1079         return true;
1080 }
1081
1082 /*
1083  * AbsorbFsyncRequests
1084  *              Retrieve queued fsync requests and pass them to local smgr.
1085  *
1086  * This is exported because it must be called during CreateCheckPoint;
1087  * we have to be sure we have accepted all pending requests just before
1088  * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
1089  * non-bgwriter processes, do nothing if not bgwriter.
1090  */
1091 void
1092 AbsorbFsyncRequests(void)
1093 {
1094         BgWriterRequest *requests = NULL;
1095         BgWriterRequest *request;
1096         int                     n;
1097
1098         if (!am_bg_writer)
1099                 return;
1100
1101         /*
1102          * We have to PANIC if we fail to absorb all the pending requests (eg,
1103          * because our hashtable runs out of memory).  This is because the system
1104          * cannot run safely if we are unable to fsync what we have been told to
1105          * fsync.  Fortunately, the hashtable is so small that the problem is
1106          * quite unlikely to arise in practice.
1107          */
1108         START_CRIT_SECTION();
1109
1110         /*
1111          * We try to avoid holding the lock for a long time by copying the request
1112          * array.
1113          */
1114         LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
1115
1116         /* Transfer write count into pending pgstats message */
1117         BgWriterStats.m_buf_written_backend += BgWriterShmem->num_backend_writes;
1118         BgWriterShmem->num_backend_writes = 0;
1119
1120         n = BgWriterShmem->num_requests;
1121         if (n > 0)
1122         {
1123                 requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
1124                 memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
1125         }
1126         BgWriterShmem->num_requests = 0;
1127
1128         LWLockRelease(BgWriterCommLock);
1129
1130         for (request = requests; n > 0; request++, n--)
1131                 RememberFsyncRequest(request->rnode, request->forknum, request->segno);
1132
1133         if (requests)
1134                 pfree(requests);
1135
1136         END_CRIT_SECTION();
1137 }