]> granicus.if.org Git - postgresql/commitdiff
Don't use O_DIRECT when writing WAL files if archiving or streaming is
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 19 Feb 2010 10:51:04 +0000 (10:51 +0000)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Fri, 19 Feb 2010 10:51:04 +0000 (10:51 +0000)
enabled. Bypassing the kernel cache is counter-productive in that case,
because the archiver/walsender process will read from the WAL file
soon after it's written, and if it's not cached the read will cause
a physical read, eating I/O bandwidth available on the WAL drive.

Also, walreceiver process does unaligned writes, so disable O_DIRECT
in walreceiver process for that reason too.

src/backend/access/transam/xlog.c
src/backend/replication/walreceiver.c
src/include/access/xlogdefs.h
src/include/replication/walreceiver.h

index 106d39b760156ecc979344ffd536be743083925a..046d80fa95dc62ae71f1852ee08e3106ce2cb417 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.376 2010/02/19 01:04:03 itagaki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.377 2010/02/19 10:51:03 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2686,13 +2686,10 @@ XLogFileClose(void)
         * WAL segment files will not be re-read in normal operation, so we advise
         * the OS to release any cached pages.  But do not do so if WAL archiving
         * or streaming is active, because archiver and walsender process could use
-        * the cache to read the WAL segment.  Also, don't bother with it if we
-        * are using O_DIRECT, since the kernel is presumably not caching in that
-        * case.
+        * the cache to read the WAL segment.
         */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-       if (!XLogIsNeeded() &&
-               (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
+       if (!XLogIsNeeded())
                (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
 static int
 get_sync_bit(int method)
 {
+       int o_direct_flag = 0;
+
        /* If fsync is disabled, never open in sync mode */
        if (!enableFsync)
                return 0;
 
+       /*
+        * Optimize writes by bypassing kernel cache with O_DIRECT when using
+        * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
+        * disabled, otherwise the archive command or walsender process will
+        * read the WAL soon after writing it, which is guaranteed to cause a
+        * physical read if we bypassed the kernel cache. We also skip the
+        * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
+        * same reason.
+        *
+        * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+        * written by walreceiver is normally read by the startup process soon
+        * after it's written. Also, walreceiver performs unaligned writes, which
+        * don't work with O_DIRECT, so it is required for correctness too.
+        */
+       if (!XLogIsNeeded() && !am_walreceiver)
+               o_direct_flag = PG_O_DIRECT;
+
        switch (method)
        {
                        /*
@@ -7670,11 +7686,11 @@ get_sync_bit(int method)
                        return 0;
 #ifdef OPEN_SYNC_FLAG
                case SYNC_METHOD_OPEN:
-                       return OPEN_SYNC_FLAG;
+                       return OPEN_SYNC_FLAG | o_direct_flag;
 #endif
 #ifdef OPEN_DATASYNC_FLAG
                case SYNC_METHOD_OPEN_DSYNC:
-                       return OPEN_DATASYNC_FLAG;
+                       return OPEN_DATASYNC_FLAG | o_direct_flag;
 #endif
                default:
                        /* can't happen (unless we are out of sync with option array) */
index 0e57611da4252dfe6f7aca090428b78a2c4e65ca..3f82693dcea0ff8997ff5ec5c1c28b3667313a70 100644 (file)
@@ -29,7 +29,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.4 2010/02/17 04:19:39 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.5 2010/02/19 10:51:04 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -50,6 +50,9 @@
 #include "utils/ps_status.h"
 #include "utils/resowner.h"
 
+/* Global variable to indicate if this process is a walreceiver process */
+bool am_walreceiver;
+
 /* libpqreceiver hooks to these when loaded */
 walrcv_connect_type walrcv_connect = NULL;
 walrcv_receive_type walrcv_receive = NULL;
@@ -158,6 +161,8 @@ WalReceiverMain(void)
        /* use volatile pointer to prevent code rearrangement */
        volatile WalRcvData *walrcv = WalRcv;
 
+       am_walreceiver = true;
+
        /*
         * WalRcv should be set up already (if we are a backend, we inherit
         * this by fork() or EXEC_BACKEND mechanism from the postmaster).
@@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
                        bool    use_existent;
 
                        /*
-                        * XLOG segment files will be re-read in recovery operation soon,
-                        * so we don't need to advise the OS to release any cache page.
+                        * fsync() and close current file before we switch to next one.
+                        * We would otherwise have to reopen this file to fsync it later.
                         */
                        if (recvFile >= 0)
                        {
+                               XLogWalRcvFlush();
+
                                /*
-                                * fsync() before we switch to next file. We would otherwise
-                                * have to reopen this file to fsync it later
+                                * XLOG segment files will be re-read by recovery in startup
+                                * process soon, so we don't advise the OS to release cache
+                                * pages associated with the file like XLogFileClose() does.
                                 */
-                               XLogWalRcvFlush();
                                if (close(recvFile) != 0)
                                        ereport(PANIC,
                                                        (errcode_for_file_access(),
@@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
                        /* Create/use new log file */
                        XLByteToSeg(recptr, recvId, recvSeg);
                        use_existent = true;
-                       recvFile = XLogFileInit(recvId, recvSeg,
-                                                                       &use_existent, true);
+                       recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
                        recvOff = 0;
                }
 
index 8ecc3a21b1c4952034e3926f44d67a995f3bedfc..0760b259308c627a9ad1603550db7e2f78fe4feb 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.25 2010/01/15 09:19:06 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.26 2010/02/19 10:51:04 heikki Exp $
  */
 #ifndef XLOG_DEFS_H
 #define XLOG_DEFS_H
@@ -106,23 +106,20 @@ typedef uint32 TimeLineID;
  * configure determined whether fdatasync() is.
  */
 #if defined(O_SYNC)
-#define BARE_OPEN_SYNC_FLAG            O_SYNC
+#define OPEN_SYNC_FLAG         O_SYNC
 #elif defined(O_FSYNC)
-#define BARE_OPEN_SYNC_FLAG            O_FSYNC
-#endif
-#ifdef BARE_OPEN_SYNC_FLAG
-#define OPEN_SYNC_FLAG                 (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+#define OPEN_SYNC_FLAG         O_FSYNC
 #endif
 
 #if defined(O_DSYNC)
 #if defined(OPEN_SYNC_FLAG)
 /* O_DSYNC is distinct? */
-#if O_DSYNC != BARE_OPEN_SYNC_FLAG
-#define OPEN_DATASYNC_FLAG             (O_DSYNC | PG_O_DIRECT)
+#if O_DSYNC != OPEN_SYNC_FLAG
+#define OPEN_DATASYNC_FLAG             O_DSYNC
 #endif
 #else                                                  /* !defined(OPEN_SYNC_FLAG) */
 /* Win32 only has O_DSYNC */
-#define OPEN_DATASYNC_FLAG             (O_DSYNC | PG_O_DIRECT)
+#define OPEN_DATASYNC_FLAG             O_DSYNC
 #endif
 #endif
 
index bf7ad41b068da3e0945eb65bf7e255f7e751e306..56af60560e7548853e160f1b3463b4f25a9a1ef2 100644 (file)
@@ -5,7 +5,7 @@
  *
  * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.6 2010/02/03 09:47:19 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.7 2010/02/19 10:51:04 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,6 +15,8 @@
 #include "access/xlogdefs.h"
 #include "storage/spin.h"
 
+extern bool am_walreceiver;
+
 /*
  * MAXCONNINFO: maximum size of a connection string.
  *