Create a new dedicated Postgres process, "wal writer", which exists to write
author Tom Lane <tgl@sss.pgh.pa.us>
Tue, 24 Jul 2007 04:54:09 +0000 (04:54 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Tue, 24 Jul 2007 04:54:09 +0000 (04:54 +0000)
and fsync WAL at convenient intervals.  For the moment it just tries to
offload this work from backends, but soon it will be responsible for
guaranteeing a maximum delay before asynchronously-committed transactions
will be flushed to disk.

This is a portion of Simon Riggs' async-commit patch, committed to CVS
separately because a background WAL writer seems like it might be a good idea
independently of the async-commit feature.  I rebased walwriter.c on
bgwriter.c because it seemed like a more appropriate way of handling signals;
while the startup/shutdown logic in postmaster.c is more like autovac because
we want walwriter to quit before we start the shutdown checkpoint.

doc/src/sgml/config.sgml
src/backend/access/transam/xlog.c
src/backend/bootstrap/bootstrap.c
src/backend/postmaster/Makefile
src/backend/postmaster/postmaster.c
src/backend/postmaster/walwriter.c [new file with mode: 0644]
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/access/xlog.h
src/include/bootstrap/bootstrap.h
src/include/postmaster/walwriter.h [new file with mode: 0644]

index a3331bdef6eb1fd686e076af8265d64ff6e40ab9..0e49ba321780d07a3e30d9f5260be1810d2472cd 100644 (file)
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.132 2007/07/24 01:53:55 alvherre Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.133 2007/07/24 04:54:08 tgl Exp $ -->
 
 <chapter Id="runtime-config">
   <title>Server Configuration</title>
@@ -1413,7 +1413,7 @@ SET ENABLE_SEQSCAN TO OFF;
        </para>
       </listitem>
      </varlistentry>
-     
+
      <varlistentry id="guc-wal-buffers" xreflabel="wal_buffers">
       <term><varname>wal_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
@@ -1438,7 +1438,27 @@ SET ENABLE_SEQSCAN TO OFF;
        </para>
       </listitem>
      </varlistentry>
-                
+
+     <varlistentry id="guc-wal-writer-delay" xreflabel="wal_writer_delay">
+      <term><varname>wal_writer_delay</varname> (<type>integer</type>)</term>
+      <indexterm>
+       <primary><varname>wal_writer_delay</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Specifies the delay between activity rounds for the WAL writer.
+        In each round the writer will flush WAL to disk. It then sleeps for
+        <varname>wal_writer_delay</> milliseconds, and repeats.  The default
+        value is 200 milliseconds (<literal>200ms</>).  Note that on many
+        systems, the effective resolution of sleep delays is 10 milliseconds;
+        setting <varname>wal_writer_delay</> to a value that is not a multiple
+        of 10 might have the same results as setting it to the next higher
+        multiple of 10. This parameter can only be set in the
+        <filename>postgresql.conf</> file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-commit-delay" xreflabel="commit_delay">
       <term><varname>commit_delay</varname> (<type>integer</type>)</term>
       <indexterm>
@@ -1521,7 +1541,7 @@ SET ENABLE_SEQSCAN TO OFF;
       </indexterm>
       <listitem>
        <para>
-        Specifies the target length of checkpoints, as a fraction of 
+        Specifies the target length of checkpoints, as a fraction of
         the checkpoint interval. The default is 0.5.
 
         This parameter can only be set in the <filename>postgresql.conf</>
index 15c9f310a63a57611f983d1b25319b9b9c5eaf2f..25789ddaa68bfb752753d0c7ab6863fdd7c5f148 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.274 2007/06/30 19:12:01 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.275 2007/07/24 04:54:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -484,7 +484,6 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
        uint32          len,
                                write_len;
        unsigned        i;
-       XLogwrtRqst LogwrtRqst;
        bool            updrqst;
        bool            doPageWrites;
        bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
@@ -643,43 +642,6 @@ begin:;
 
        START_CRIT_SECTION();
 
-       /* update LogwrtResult before doing cache fill check */
-       {
-               /* use volatile pointer to prevent code rearrangement */
-               volatile XLogCtlData *xlogctl = XLogCtl;
-
-               SpinLockAcquire(&xlogctl->info_lck);
-               LogwrtRqst = xlogctl->LogwrtRqst;
-               LogwrtResult = xlogctl->LogwrtResult;
-               SpinLockRelease(&xlogctl->info_lck);
-       }
-
-       /*
-        * If cache is half filled then try to acquire write lock and do
-        * XLogWrite. Ignore any fractional blocks in performing this check.
-        */
-       LogwrtRqst.Write.xrecoff -= LogwrtRqst.Write.xrecoff % XLOG_BLCKSZ;
-       if (LogwrtRqst.Write.xlogid != LogwrtResult.Write.xlogid ||
-               (LogwrtRqst.Write.xrecoff >= LogwrtResult.Write.xrecoff +
-                XLogCtl->XLogCacheByte / 2))
-       {
-               if (LWLockConditionalAcquire(WALWriteLock, LW_EXCLUSIVE))
-               {
-                       /*
-                        * Since the amount of data we write here is completely optional
-                        * anyway, tell XLogWrite it can be "flexible" and stop at a
-                        * convenient boundary.  This allows writes triggered by this
-                        * mechanism to synchronize with the cache boundaries, so that in
-                        * a long transaction we'll basically dump alternating halves of
-                        * the buffer array.
-                        */
-                       LogwrtResult = XLogCtl->Write.LogwrtResult;
-                       if (XLByteLT(LogwrtResult.Write, LogwrtRqst.Write))
-                               XLogWrite(LogwrtRqst, true, false);
-                       LWLockRelease(WALWriteLock);
-               }
-       }
-
        /* Now wait to get insert lock */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
@@ -1800,6 +1762,85 @@ XLogFlush(XLogRecPtr record)
                         LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
 }
 
+/*
+ * Flush xlog, but without specifying exactly where to flush to.
+ *
+ * We normally flush only completed blocks; but if there is nothing to do on
+ * that basis, we check for unflushed async commits in the current incomplete
+ * block, and flush through the latest one of those.  Thus, if async commits
+ * are not being used, we will flush complete blocks only.  We can guarantee
+ * that async commits reach disk after at most three cycles; normally only
+ * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
+ * at the end of the buffer ring; this makes a difference only with very high
+ * load or long wal_writer_delay, but imposes one extra cycle for the worst
+ * case for async commits.)
+ *
+ * NOTE: the async-commit check described above is currently compiled out
+ * (see the "#ifdef NOT_YET" section in the body), so as the code stands
+ * only completed blocks are ever requested here.
+ *
+ * Outline of the steps taken:
+ *	1. Snapshot the shared LogwrtResult and the shared write-request
+ *	   pointer while holding the info_lck spinlock.
+ *	2. Round the request pointer down to a multiple of XLOG_BLCKSZ.
+ *	3. Return immediately if that point is already known to be flushed.
+ *	4. Otherwise take WALWriteLock exclusively, refresh LogwrtResult,
+ *	   recheck, and if still needed call XLogWrite with both the write and
+ *	   flush targets set to the computed pointer.
+ *
+ * Takes no arguments and returns nothing; the process-local LogwrtResult
+ * copy is updated as a side effect.
+ *
+ * This routine is invoked periodically by the background walwriter process.
+ */
+void
+XLogBackgroundFlush(void)
+{
+       XLogRecPtr      WriteRqstPtr;
+       bool            flexible = true;	/* passed to XLogWrite; see header comment */
+
+       /*
+        * read LogwrtResult and update local state: copy the shared write/flush
+        * progress and the shared write-request pointer while holding the
+        * info_lck spinlock, so that both values are read together
+        */
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               SpinLockAcquire(&xlogctl->info_lck);
+               LogwrtResult = xlogctl->LogwrtResult;
+               WriteRqstPtr = xlogctl->LogwrtRqst.Write;
+               SpinLockRelease(&xlogctl->info_lck);
+       }
+
+       /*
+        * back off to last completed page boundary, by dropping the part of the
+        * offset that lies beyond the last multiple of XLOG_BLCKSZ
+        */
+       WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
+
+#ifdef NOT_YET                                 /* async commit patch is still to come */
+       /* if we have already flushed that far, consider async commit records */
+       if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               SpinLockAcquire(&xlogctl->async_commit_lck);
+               WriteRqstPtr = xlogctl->asyncCommitLSN;
+               SpinLockRelease(&xlogctl->async_commit_lck);
+               flexible = false;               /* ensure it all gets written */
+       }
+#endif
+
+       /*
+        * Done if already known flushed.  This test uses only our local snapshot,
+        * so the common nothing-to-do case returns without taking WALWriteLock.
+        */
+       if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+               return;
+
+#ifdef WAL_DEBUG
+       if (XLOG_DEBUG)
+               elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
+                        WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
+                        LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
+                        LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
+#endif
+
+       START_CRIT_SECTION();
+
+       /*
+        * now wait for the write lock.  Once we hold it, refresh our local
+        * LogwrtResult from shared memory and test again: regular backends can
+        * also write and flush WAL, so the flush point may have moved past our
+        * target while we were waiting, in which case there is nothing to do.
+        */
+       LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+       LogwrtResult = XLogCtl->Write.LogwrtResult;
+       if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
+       {
+               XLogwrtRqst WriteRqst;
+
+               WriteRqst.Write = WriteRqstPtr;	/* write up to the target ... */
+               WriteRqst.Flush = WriteRqstPtr;	/* ... and flush up to the same point */
+               XLogWrite(WriteRqst, flexible, false);
+       }
+       LWLockRelease(WALWriteLock);
+
+       END_CRIT_SECTION();
+}
+
+
 /*
  * Test whether XLOG data has been flushed up to (at least) the given position.
  *
index 78eb6797db43883893f5417f0cd158fb704bdd16..3ffff2a2cce3fe0ee2bb50fd079a758bcf2b747b 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.234 2007/06/28 00:02:37 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.235 2007/07/24 04:54:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -30,6 +30,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/walwriter.h"
 #include "storage/freespace.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
@@ -195,7 +196,7 @@ static IndexList *ILHead = NULL;
  *      AuxiliaryProcessMain
  *
  *      The main entry point for auxiliary processes, such as the bgwriter,
- *      bootstrapper and the shared memory checker code.
+ *      walwriter, bootstrapper and the shared memory checker code.
  *
  *      This code is here just because of historical reasons.
  */
@@ -331,6 +332,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
                        case BgWriterProcess:
                                statmsg = "writer process";
                                break;
+                       case WalWriterProcess:
+                               statmsg = "wal writer process";
+                               break;
                        default:
                                statmsg = "??? process";
                                break;
@@ -419,6 +423,12 @@ AuxiliaryProcessMain(int argc, char *argv[])
                        InitXLOGAccess();
                        BackgroundWriterMain();
                        proc_exit(1);           /* should never return */
+
+               case WalWriterProcess:
+                       /* don't set signals, walwriter has its own agenda */
+                       InitXLOGAccess();
+                       WalWriterMain();
+                       proc_exit(1);           /* should never return */
                        
                default:
                        elog(PANIC, "unrecognized process type: %d", auxType);
index a49e0e393bfc6ffcb575697351589a2051e845d3..7ccba285f2149e173921e7e3a6e9debf1074ad7b 100644 (file)
@@ -4,7 +4,7 @@
 #    Makefile for src/backend/postmaster
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.22 2007/01/20 17:16:12 petere Exp $
+#    $PostgreSQL: pgsql/src/backend/postmaster/Makefile,v 1.23 2007/07/24 04:54:09 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,8 +12,8 @@ subdir = src/backend/postmaster
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = bgwriter.o autovacuum.o pgarch.o pgstat.o postmaster.o syslogger.o \
-       fork_process.o
+OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
+       syslogger.o walwriter.o
 
 all: SUBSYS.o
 
index 7a1270b0149eb0b10e7b518d252b430b66463c68..f1f9effae77085b7ec2c23aa25e0d570e3a350ac 100644 (file)
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.534 2007/07/23 10:16:54 mha Exp $
+ *       $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.535 2007/07/24 04:54:09 tgl Exp $
  *
  * NOTES
  *
@@ -136,7 +136,7 @@ typedef struct bkend
 {
        pid_t           pid;                    /* process id of backend */
        long            cancel_key;             /* cancel key for cancels for this backend */
-       bool            is_autovacuum;  /* is it an autovacuum process */
+       bool            is_autovacuum;  /* is it an autovacuum process? */
 } Backend;
 
 static Dllist *BackendList;
@@ -144,9 +144,9 @@ static Dllist *BackendList;
 #ifdef EXEC_BACKEND
 /*
  * Number of entries in the backend table. Twice the number of backends,
- * plus four other subprocesses (stats, bgwriter, autovac, logger).
+ * plus five other subprocesses (stats, bgwriter, walwriter, autovac, logger).
  */
-#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 4)
+#define NUM_BACKENDARRAY_ELEMS (2*MaxBackends + 5)
 static Backend *ShmemBackendArray;
 #endif
 
@@ -201,6 +201,7 @@ char           *bonjour_name;
 /* PIDs of special child processes; 0 when not running */
 static pid_t StartupPID = 0,
                        BgWriterPID = 0,
+                       WalWriterPID = 0,
                        AutoVacPID = 0,
                        PgArchPID = 0,
                PgStatPID = 0,
@@ -221,7 +222,7 @@ bool                ClientAuthInProgress = false;           /* T during new-client
 bool redirection_done = false; 
 
 /* received START_AUTOVAC_LAUNCHER signal */
-static bool start_autovac_launcher = false;
+static volatile sig_atomic_t start_autovac_launcher = false;
 
 /*
  * State for assigning random salts and cancel keys.
@@ -365,6 +366,7 @@ static void ShmemBackendArrayRemove(pid_t pid);
 
 #define StartupDataBase()              StartChildProcess(StartupProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
+#define StartWalWriter()               StartChildProcess(WalWriterProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -909,8 +911,9 @@ PostmasterMain(int argc, char *argv[])
         *
         * CAUTION: when changing this list, check for side-effects on the signal
         * handling setup of child processes.  See tcop/postgres.c,
-        * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/autovacuum.c,
-        * postmaster/pgarch.c, postmaster/pgstat.c, and postmaster/syslogger.c.
+        * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
+        * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and
+        * postmaster/syslogger.c.
         */
        pqinitmask();
        PG_SETMASK(&BlockSig);
@@ -1244,6 +1247,15 @@ ServerLoop(void)
                                signal_child(BgWriterPID, SIGUSR2);
                }
 
+               /*
+                * Likewise, if we have lost the walwriter process, try to start a
+                * new one.  We don't need walwriter to complete a shutdown, so
+                * don't start it if shutdown already initiated.
+                */
+               if (WalWriterPID == 0 &&
+                       StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
+                       WalWriterPID = StartWalWriter();
+
                /* If we have lost the autovacuum launcher, try to start a new one */
                if (AutoVacPID == 0 &&
                        (AutoVacuumingActive() || start_autovac_launcher) &&
@@ -1251,7 +1263,7 @@ ServerLoop(void)
                {
                        AutoVacPID = StartAutoVacLauncher();
                        if (AutoVacPID != 0)
-                               start_autovac_launcher = false; /* signal successfully processed */
+                               start_autovac_launcher = false; /* signal processed */
                }
 
                /* If we have lost the archiver, try to start a new one */
@@ -1842,6 +1854,8 @@ SIGHUP_handler(SIGNAL_ARGS)
                SignalChildren(SIGHUP);
                if (BgWriterPID != 0)
                        signal_child(BgWriterPID, SIGHUP);
+               if (WalWriterPID != 0)
+                       signal_child(WalWriterPID, SIGHUP);
                if (AutoVacPID != 0)
                        signal_child(AutoVacPID, SIGHUP);
                if (PgArchPID != 0)
@@ -1901,8 +1915,11 @@ pmdie(SIGNAL_ARGS)
                        /* and the autovac launcher too */
                        if (AutoVacPID != 0)
                                signal_child(AutoVacPID, SIGTERM);
+                       /* and the walwriter too */
+                       if (WalWriterPID != 0)
+                               signal_child(WalWriterPID, SIGTERM);
 
-                       if (DLGetHead(BackendList) || AutoVacPID != 0)
+                       if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0)
                                break;                  /* let reaper() handle this */
 
                        /*
@@ -1938,7 +1955,7 @@ pmdie(SIGNAL_ARGS)
                        ereport(LOG,
                                        (errmsg("received fast shutdown request")));
 
-                       if (DLGetHead(BackendList) || AutoVacPID != 0)
+                       if (DLGetHead(BackendList) || AutoVacPID != 0 || WalWriterPID != 0)
                        {
                                if (!FatalError)
                                {
@@ -1947,6 +1964,8 @@ pmdie(SIGNAL_ARGS)
                                        SignalChildren(SIGTERM);
                                        if (AutoVacPID != 0)
                                                signal_child(AutoVacPID, SIGTERM);
+                                       if (WalWriterPID != 0)
+                                               signal_child(WalWriterPID, SIGTERM);
                                        /* reaper() does the rest */
                                }
                                break;
@@ -1957,6 +1976,7 @@ pmdie(SIGNAL_ARGS)
                         *
                         * Note: if we previously got SIGTERM then we may send SIGUSR2 to
                         * the bgwriter a second time here.  This should be harmless.
+                        * Ditto for the signals to the other special children.
                         */
                        if (StartupPID != 0)
                        {
@@ -1993,6 +2013,8 @@ pmdie(SIGNAL_ARGS)
                                signal_child(StartupPID, SIGQUIT);
                        if (BgWriterPID != 0)
                                signal_child(BgWriterPID, SIGQUIT);
+                       if (WalWriterPID != 0)
+                               signal_child(WalWriterPID, SIGQUIT);
                        if (AutoVacPID != 0)
                                signal_child(AutoVacPID, SIGQUIT);
                        if (PgArchPID != 0)
@@ -2091,13 +2113,14 @@ reaper(SIGNAL_ARGS)
 
                        /*
                         * Go to shutdown mode if a shutdown request was pending.
-                        * Otherwise, try to start the archiver, stats collector and
-                        * autovacuum launcher.
+                        * Otherwise, try to start the other special children.
                         */
                        if (Shutdown > NoShutdown && BgWriterPID != 0)
                                signal_child(BgWriterPID, SIGUSR2);
                        else if (Shutdown == NoShutdown)
                        {
+                               if (WalWriterPID == 0)
+                                       WalWriterPID = StartWalWriter();
                                if (XLogArchivingActive() && PgArchPID == 0)
                                        PgArchPID = pgarch_start();
                                if (PgStatPID == 0)
@@ -2121,7 +2144,8 @@ reaper(SIGNAL_ARGS)
                        BgWriterPID = 0;
                        if (EXIT_STATUS_0(exitstatus) &&
                                Shutdown > NoShutdown && !FatalError &&
-                               !DLGetHead(BackendList) && AutoVacPID == 0)
+                               !DLGetHead(BackendList) &&
+                               WalWriterPID == 0 && AutoVacPID == 0)
                        {
                                /*
                                 * Normal postmaster exit is here: we've seen normal exit of
@@ -2150,7 +2174,8 @@ reaper(SIGNAL_ARGS)
                         * required will happen on next postmaster start.
                         */
                        if (Shutdown > NoShutdown &&
-                               !DLGetHead(BackendList) && AutoVacPID == 0)
+                               !DLGetHead(BackendList) &&
+                               WalWriterPID == 0 && AutoVacPID == 0)
                        {
                                ereport(LOG,
                                                (errmsg("abnormal database system shutdown")));
@@ -2161,6 +2186,20 @@ reaper(SIGNAL_ARGS)
                        continue;
                }
 
+               /*
+                * Was it the wal writer?  Normal exit can be ignored; we'll
+                * start a new one at the next iteration of the postmaster's main loop,
+                * if necessary.  Any other exit condition is treated as a crash.
+                */
+               if (WalWriterPID != 0 && pid == WalWriterPID)
+               {
+                       WalWriterPID = 0;
+                       if (!EXIT_STATUS_0(exitstatus))
+                               HandleChildCrash(pid, exitstatus,
+                                                                _("wal writer process"));
+                       continue;
+               }
+
                /*
                 * Was it the autovacuum launcher?  Normal exit can be ignored; we'll
                 * start a new one at the next iteration of the postmaster's main loop,
@@ -2233,7 +2272,8 @@ reaper(SIGNAL_ARGS)
                 * StartupDataBase.  (We can ignore the archiver and stats processes
                 * here since they are not connected to shmem.)
                 */
-               if (DLGetHead(BackendList) || StartupPID != 0 || BgWriterPID != 0 ||
+               if (DLGetHead(BackendList) || StartupPID != 0 ||
+                       BgWriterPID != 0 || WalWriterPID != 0 ||
                        AutoVacPID != 0)
                        goto reaper_done;
                ereport(LOG,
@@ -2249,7 +2289,8 @@ reaper(SIGNAL_ARGS)
 
        if (Shutdown > NoShutdown)
        {
-               if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0)
+               if (DLGetHead(BackendList) || StartupPID != 0 || AutoVacPID != 0 ||
+                       WalWriterPID != 0)
                        goto reaper_done;
                /* Start the bgwriter if not running */
                if (BgWriterPID == 0)
@@ -2315,7 +2356,8 @@ CleanupBackend(int pid,
 }
 
 /*
- * HandleChildCrash -- cleanup after failed backend, bgwriter, or autovacuum.
+ * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter,
+ * or autovacuum.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -2390,6 +2432,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
                signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
        }
 
+       /* Take care of the walwriter too */
+       if (pid == WalWriterPID)
+               WalWriterPID = 0;
+       else if (WalWriterPID != 0 && !FatalError)
+       {
+               ereport(DEBUG2,
+                               (errmsg_internal("sending %s to process %d",
+                                                                (SendStop ? "SIGSTOP" : "SIGQUIT"),
+                                                                (int) WalWriterPID)));
+               signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+       }
+
        /* Take care of the autovacuum launcher too */
        if (pid == AutoVacPID)
                AutoVacPID = 0;
@@ -3622,9 +3676,11 @@ sigusr1_handler(SIGNAL_ARGS)
                start_autovac_launcher = true;
        }
 
-       /* The autovacuum launcher wants us to start a worker process. */
        if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER))
+       {
+               /* The autovacuum launcher wants us to start a worker process. */
                StartAutovacuumWorker();
+       }
 
        PG_SETMASK(&UnBlockSig);
 
@@ -3805,6 +3861,10 @@ StartChildProcess(AuxProcType type)
                                ereport(LOG,
                                   (errmsg("could not fork background writer process: %m")));
                                break;
+                       case WalWriterProcess:
+                               ereport(LOG,
+                                  (errmsg("could not fork wal writer process: %m")));
+                               break;
                        default:
                                ereport(LOG,
                                                (errmsg("could not fork process: %m")));
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
new file mode 100644 (file)
index 0000000..b4d5946
--- /dev/null
@@ -0,0 +1,311 @@
+/*-------------------------------------------------------------------------
+ *
+ * walwriter.c
+ *
+ * The WAL writer background process is new as of Postgres 8.3.  It attempts
+ * to keep regular backends from having to write out (and fsync) WAL pages.
+ * Also, it guarantees that transaction commit records that weren't synced
+ * to disk immediately upon commit (ie, were "asynchronously committed")
+ * will reach disk within a knowable time --- which, as it happens, is at
+ * most three times the wal_writer_delay cycle time.
+ *
+ * Note that as with the bgwriter for shared buffers, regular backends are
+ * still empowered to issue WAL writes and fsyncs when the walwriter doesn't
+ * keep up.
+ *
+ * Because the walwriter's cycle is directly linked to the maximum delay
+ * before async-commit transactions are guaranteed committed, it's probably
+ * unwise to load additional functionality onto it.  For instance, if you've
+ * got a yen to create xlog segments further in advance, that'd be better done
+ * in bgwriter than in walwriter.
+ *
+ * The walwriter is started by the postmaster as soon as the startup subprocess
+ * finishes.  It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs the walwriter to exit(0).
+ * Emergency termination is by SIGQUIT; like any backend, the walwriter will
+ * simply abort and exit on SIGQUIT.
+ *
+ * If the walwriter exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL: pgsql/src/backend/postmaster/walwriter.c,v 1.1 2007/07/24 04:54:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/xlog.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "postmaster/walwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/smgr.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+
+
+/*
+ * GUC parameters (WalWriterDelay is wal_writer_delay, in milliseconds)
+ */
+int                    WalWriterDelay = 200;
+
+/*
+ * Flags set by the signal handlers below and serviced by the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+
+/* Prototypes for local signal handlers */
+static void wal_quickdie(SIGNAL_ARGS);
+static void WalSigHupHandler(SIGNAL_ARGS);
+static void WalShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walwriter process.  Does not return normally: the
+ * main loop exits the process with proc_exit(0) on a shutdown request, or
+ * with exit(1) if the postmaster has died.  Invoked from BootstrapMain, which
+ * has already created the basic execution environment, but not enabled signals.
+ */
+void
+WalWriterMain(void)
+{
+       sigjmp_buf      local_sigjmp_buf;
+       MemoryContext walwriter_context;
+
+       /*
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.  (walwriter probably never has
+        * any child processes, but for consistency we make all postmaster
+        * child processes do this.)
+        */
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
+
+       /*
+        * Properly accept or ignore signals the postmaster might send us
+        *
+        * We have no particular use for SIGINT at the moment, but it seems
+        * reasonable to treat it like SIGTERM.
+        */
+       pqsignal(SIGHUP, WalSigHupHandler);     /* set flag to read config file */
+       pqsignal(SIGINT, WalShutdownHandler);           /* request shutdown */
+       pqsignal(SIGTERM, WalShutdownHandler);          /* request shutdown */
+       pqsignal(SIGQUIT, wal_quickdie);                /* hard crash time */
+       pqsignal(SIGALRM, SIG_IGN);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */
+       pqsignal(SIGUSR2, SIG_IGN);     /* not used */
+
+       /*
+        * Reset some signals that are accepted by postmaster but not here
+        */
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
+
+       /* We allow SIGQUIT (quickdie) at all times */
+#ifdef HAVE_SIGPROCMASK
+       sigdelset(&BlockSig, SIGQUIT);
+#else
+       BlockSig &= ~(sigmask(SIGQUIT));
+#endif
+
+       /*
+        * Create a resource owner to keep track of our resources (not clear
+        * that we need this, but may as well have one).
+        */
+       CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");
+
+       /*
+        * Create a memory context that we will do all our work in.  We do this so
+        * that we can reset the context during error recovery and thereby avoid
+        * possible memory leaks.  We don't simply run in TopMemoryContext,
+        * because resetting that would be a really bad idea.
+        */
+       walwriter_context = AllocSetContextCreate(TopMemoryContext,
+                                                                                         "Wal Writer",
+                                                                                         ALLOCSET_DEFAULT_MINSIZE,
+                                                                                         ALLOCSET_DEFAULT_INITSIZE,
+                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
+       MemoryContextSwitchTo(walwriter_context);
+
+       /*
+        * If an exception is encountered, processing resumes here.
+        *
+        * This code is heavily based on bgwriter.c, q.v.
+        */
+       if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+       {
+               /* Since not using PG_TRY, must reset error stack by hand */
+               error_context_stack = NULL;
+
+               /* Prevent interrupts while cleaning up */
+               HOLD_INTERRUPTS();
+
+               /* Report the error to the server log */
+               EmitErrorReport();
+
+               /*
+                * These operations are really just a minimal subset of
+                * AbortTransaction().  We don't have very many resources to worry
+                * about in walwriter, but we do have LWLocks, and perhaps buffers?
+                */
+               LWLockReleaseAll();
+               AbortBufferIO();
+               UnlockBuffers();
+               /* buffer pins are released here: */
+               ResourceOwnerRelease(CurrentResourceOwner,
+                                                        RESOURCE_RELEASE_BEFORE_LOCKS,
+                                                        false, true);
+               /* we needn't bother with the other ResourceOwnerRelease phases */
+               AtEOXact_Buffers(false);
+
+               /*
+                * Now return to normal top-level context and clear ErrorContext for
+                * next time.
+                */
+               MemoryContextSwitchTo(walwriter_context);
+               FlushErrorState();
+
+               /* Flush any leaked data in the top-level context */
+               MemoryContextResetAndDeleteChildren(walwriter_context);
+
+               /* Now we can allow interrupts again */
+               RESUME_INTERRUPTS();
+
+               /*
+                * Sleep at least 1 second after any error.  A write error is likely
+                * to be repeated, and we don't want to be filling the error logs as
+                * fast as we can.
+                */
+               pg_usleep(1000000L);
+
+               /*
+                * Close all open files after any error.  This is helpful on Windows,
+                * where holding deleted files open causes various strange errors.
+                * It's not clear we need it elsewhere, but shouldn't hurt.
+                */
+               smgrcloseall();
+       }
+
+       /* We can now handle ereport(ERROR) */
+       PG_exception_stack = &local_sigjmp_buf;
+
+       /*
+        * Unblock signals (they were blocked when the postmaster forked us)
+        */
+       PG_SETMASK(&UnBlockSig);
+
+       /*
+        * Loop forever
+        */
+       for (;;)
+       {
+               long            udelay;
+
+               /*
+                * Emergency bailout if postmaster has died.  This is to avoid the
+                * necessity for manual cleanup of all postmaster children.
+                */
+               if (!PostmasterIsAlive(true))
+                       exit(1);
+
+               /*
+                * Process any requests or signals received recently.
+                */
+               if (got_SIGHUP)
+               {
+                       got_SIGHUP = false;
+                       ProcessConfigFile(PGC_SIGHUP);
+               }
+               if (shutdown_requested)
+               {
+                       /* Normal exit from the walwriter is here */
+                       proc_exit(0);           /* done */
+               }
+
+               /*
+                * Do what we're here for...
+                */
+               XLogBackgroundFlush();
+
+               /*
+                * Sleep for WalWriterDelay milliseconds, in chunks of at most one
+                * second so that a pending SIGHUP or shutdown request is noticed.
+                */
+               udelay = WalWriterDelay * 1000L;
+               while (udelay > 999999L)
+               {
+                       if (got_SIGHUP || shutdown_requested)
+                               break;
+                       pg_usleep(1000000L);
+                       udelay -= 1000000L;
+               }
+               if (!(got_SIGHUP || shutdown_requested))
+                       pg_usleep(udelay);
+       }
+}
+
+
+/* --------------------------------
+ *             signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * wal_quickdie() is the SIGQUIT handler; the postmaster sends SIGQUIT when
+ * some other process has crashed and shared memory may be corrupted.
+ *
+ * Block further signals and terminate immediately, without any cleanup.
+ */
+static void
+wal_quickdie(SIGNAL_ARGS)
+{
+       PG_SETMASK(&BlockSig);
+
+       /*
+        * DO NOT proc_exit() -- we're here because shared memory may be
+        * corrupted, so we must not run any cleanup that touches it.  Use
+        * _exit() rather than exit(): exit() is not async-signal-safe and would
+        * run atexit handlers and flush stdio from inside a signal handler.
+        *
+        * Note the status is 2, not 0.  This forces the postmaster into a
+        * system reset cycle if someone sends a manual SIGQUIT to a random
+        * process, which is necessary because we skip shared memory cleanup.
+        */
+       _exit(2);
+}
+
+/* SIGHUP: set flag so the main loop re-reads the config file on its next pass */
+static void
+WalSigHupHandler(SIGNAL_ARGS)
+{
+       got_SIGHUP = true;
+}
+
+/* SIGINT or SIGTERM: set flag so the main loop exits normally via proc_exit */
+static void
+WalShutdownHandler(SIGNAL_ARGS)
+{
+       shutdown_requested = true;
+}
index 06915017e6e794fe16bce272fa091bfed97b83ad..b2d0ea9cae597f6d6397e2695bceecb6795b9fc3 100644 (file)
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.406 2007/07/24 01:53:56 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.407 2007/07/24 04:54:09 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -54,6 +54,7 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
+#include "postmaster/walwriter.h"
 #include "storage/fd.h"
 #include "storage/freespace.h"
 #include "tcop/tcopprot.h"
@@ -1509,6 +1510,16 @@ static struct config_int ConfigureNamesInt[] =
                8, 4, INT_MAX, NULL, NULL
        },
 
+       {
+               {"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS,
+                       gettext_noop("WAL writer sleep time between WAL flushes."),
+                       NULL,
+                       GUC_UNIT_MS
+               },
+               &WalWriterDelay,
+               200, 1, 10000, NULL, NULL
+       },
+
        {
                {"commit_delay", PGC_USERSET, WAL_CHECKPOINTS,
                        gettext_noop("Sets the delay in microseconds between transaction commit and "
index 51c83ade0afdf324af8d616576ef26c29228fadd..8bfad997ff38cf96b49d0eb333b899e270d0ae99 100644 (file)
 #full_page_writes = on                 # recover from partial page writes
 #wal_buffers = 64kB                    # min 32kB
                                        # (change requires restart)
+#wal_writer_delay = 200ms              # range 1-10000, in milliseconds
+
 #commit_delay = 0                      # range 0-100000, in microseconds
 #commit_siblings = 5                   # range 1-1000
 
index 1b4fecdb966f11f485360e28345c109cedcce15c..adc99a6eb0610e0f551279e8d721003bfd992106 100644 (file)
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.80 2007/06/30 19:12:02 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.81 2007/07/24 04:54:09 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -196,6 +196,7 @@ extern CheckpointStatsData CheckpointStats;
 
 extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
 extern void XLogFlush(XLogRecPtr RecPtr);
+extern void XLogBackgroundFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
index bbde68ea1b14f0211640498f3fc35de50e931da6..d75626c8d257f5de764265270c9c689811472496 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.46 2007/03/07 13:35:03 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/bootstrap/bootstrap.h,v 1.47 2007/07/24 04:54:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,7 +69,8 @@ typedef enum
        CheckerProcess,
        BootstrapProcess,
        StartupProcess,
-       BgWriterProcess
+       BgWriterProcess,
+       WalWriterProcess
 } AuxProcType;
 
 #endif   /* BOOTSTRAP_H */
diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h
new file mode 100644 (file)
index 0000000..3cefe9a
--- /dev/null
@@ -0,0 +1,20 @@
+/*-------------------------------------------------------------------------
+ *
+ * walwriter.h
+ *       Exports from postmaster/walwriter.c.
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/include/postmaster/walwriter.h,v 1.1 2007/07/24 04:54:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef WALWRITER_H
+#define WALWRITER_H
+
+/* GUC options */
+extern int     WalWriterDelay;         /* wal_writer_delay, in milliseconds */
+
+extern void WalWriterMain(void);
+
+#endif   /* WALWRITER_H */