]> granicus.if.org Git - postgresql/commitdiff
Use O_DIRECT if available when using O_SYNC for wal_sync_method.
authorBruce Momjian <bruce@momjian.us>
Fri, 29 Jul 2005 03:22:33 +0000 (03:22 +0000)
committerBruce Momjian <bruce@momjian.us>
Fri, 29 Jul 2005 03:22:33 +0000 (03:22 +0000)
Also, write multiple WAL buffers out in one write() operation.

ITAGAKI Takahiro

---------------------------------------------------------------------------

> If we disable the write-back cache and use open_sync, the per-page write
> behavior of the WAL module shows up as poor performance. O_DIRECT behaves
> much like O_DSYNC (at least on Linux), so its benefit is hidden behind
> the slow rotation of the disk.
>
> In the current source, WAL is written as:
>     for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); }
> Is this intentional? Can we rewrite it as follows?
>    write(&buffers[0], N * BLCKSZ);
>
> In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff).
> Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff).
> These two patches are independent, so either one or both can be applied.
>
>
> I tested them on my machine, and the results are as follows. They show that
> direct I/O combined with gather-write is the best choice when the write-back
> cache is off. Are these two patches worth trying together?
>
>
>             | writeback | fsync= | fdata | open_ | fsync_ | open_
> patch       | cache     |  false |  sync |  sync | direct | direct
> ------------+-----------+--------+-------+-------+--------+---------
> direct io   | off       |  124.2 | 105.7 |  48.3 |   48.3 |  48.2
> direct io   | on        |  129.1 | 112.3 | 114.1 |  142.9 | 144.5
> gather-write| off       |  124.3 | 108.7 | 105.4 |  (N/A) | (N/A)
> both        | off       |  131.5 | 115.5 | 114.4 |  145.4 | 145.2
>
> - 20runs * pgbench -s 100 -c 50 -t 200
>    - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8)
> - using 2 ATA disks:
>    - hda(reiserfs) includes system and wal.
>    - hdc(jfs) includes database files. writeback-cache is always on.
>
> ---
> ITAGAKI Takahiro

src/backend/access/transam/xlog.c

index 1204ae34bf43a14105154348456f2b9118b5daee..fc9264cfcd9910dc8d7b7ac30ff35b694b9be452 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "utils/relcache.h"
 
 
+/*
+ *     Because O_DIRECT bypasses the kernel buffers, and because we never
+ *     read those buffers except during crash recovery, it is a win to use
+ *     it in all cases where we sync on each write().  We could allow O_DIRECT
+ *     with fsync(), but because skipping the kernel buffer forces writes out
+ *     quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
+ *     how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
+ *
+ *     PG_O_DIRECT expands to O_DIRECT where the platform defines it and to 0
+ *     otherwise, so it can be OR'ed into open() flags unconditionally.
+ */
+#ifdef O_DIRECT
+#define PG_O_DIRECT                            O_DIRECT
+#else
+#define PG_O_DIRECT                            0
+#endif
+
 /*
  * This chunk of hackery attempts to determine which file sync methods
  * are available on the current platform, and to choose an appropriate
  * configure determined whether fdatasync() is.
  */
 #if defined(O_SYNC)
-#define OPEN_SYNC_FLAG                 O_SYNC
+#define CMP_OPEN_SYNC_FLAG             O_SYNC
 #else
 #if defined(O_FSYNC)
-#define OPEN_SYNC_FLAG                 O_FSYNC
+#define CMP_OPEN_SYNC_FLAG             O_FSYNC
 #endif
 #endif
+#define OPEN_SYNC_FLAG                 (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
 
 #if defined(O_DSYNC)
 #if defined(OPEN_SYNC_FLAG)
-#if O_DSYNC != OPEN_SYNC_FLAG
-#define OPEN_DATASYNC_FLAG             O_DSYNC
+#if O_DSYNC != CMP_OPEN_SYNC_FLAG
+#define OPEN_DATASYNC_FLAG             (O_DSYNC | PG_O_DIRECT)
 #endif
 #else /* !defined(OPEN_SYNC_FLAG) */
 /* Win32 only has O_DSYNC */
-#define OPEN_DATASYNC_FLAG             O_DSYNC
+#define OPEN_DATASYNC_FLAG             (O_DSYNC | PG_O_DIRECT)
 #endif
 #endif
 
+/*
+ * The buffer-alignment requirements for direct I/O depend on the OS and
+ * filesystem, but BLCKSZ is assumed to be sufficient.
+ */
+#ifdef O_DIRECT
+#define ALIGNOF_XLOG_BUFFER            BLCKSZ
+#else
+#define ALIGNOF_XLOG_BUFFER            MAXIMUM_ALIGNOF
+#endif
+
+/*
+ * Switch the alignment routine because ShmemAlloc() returns a max-aligned
+ * buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
+ *
+ * XLOG_BUFFER_ALIGN(LEN) is meant for sizing: when the required alignment
+ * exceeds MAXIMUM_ALIGNOF it pads LEN by a full ALIGNOF_XLOG_BUFFER bytes,
+ * which leaves room to round the start pointer up afterwards.
+ */
+#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
+#define XLOG_BUFFER_ALIGN(LEN) MAXALIGN((LEN))
+#else
+#define XLOG_BUFFER_ALIGN(LEN) ((LEN) + (ALIGNOF_XLOG_BUFFER))
+#endif
+/*
+ * POINTERALIGN rounds PTR up to the next multiple of ALIGNVAL and yields a
+ * char pointer.  The mask arithmetic is only correct when ALIGNVAL is a
+ * power of two.  Assumes sizeof(ptrdiff_t) == sizeof(void*).
+ */
+#define POINTERALIGN(ALIGNVAL,PTR)     \
+       ((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1))))
+#define XLOG_BUFFER_POINTERALIGN(PTR)  \
+       POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
+
 #if defined(OPEN_DATASYNC_FLAG)
 #define DEFAULT_SYNC_METHOD_STR        "open_datasync"
 #define DEFAULT_SYNC_METHOD            SYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
 static char *str_time(time_t tnow);
 static void issue_xlog_fsync(void);
 
+/*
+ * XLog gather-write stuff
+ *
+ * XLogPages describes one run of WAL buffer pages that are adjacent both in
+ * memory and in the segment file, so that the whole run can be written
+ * with a single write() call.
+ */
+typedef struct XLogPages
+{
+       char    *head;          /* Start of the first queued page; NULL when nothing is queued */
+       int              size;          /* Total bytes of pages == count(pages) * BLCKSZ */
+       int              offset;        /* Offset of the first queued page in the xlog segment file */
+} XLogPages;
+static void XLogPageReset(XLogPages *pages);
+static void XLogPageWrite(XLogPages *pages, int index);
+static void XLogPageFlush(XLogPages *pages, int index);
+
 #ifdef WAL_DEBUG
 static void xlog_outrec(char *buf, XLogRecord *record);
 #endif
@@ -1245,9 +1296,10 @@ static void
 XLogWrite(XLogwrtRqst WriteRqst)
 {
        XLogCtlWrite *Write = &XLogCtl->Write;
-       char       *from;
        bool            ispartialpage;
        bool            use_existent;
+       int                     currentIndex = Write->curridx;
+       XLogPages       pages;
 
        /* We should always be inside a critical section here */
        Assert(CritSectionCount > 0);
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
         */
        LogwrtResult = Write->LogwrtResult;
 
+       XLogPageReset(&pages);
+
        while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
        {
                /*
@@ -1266,14 +1320,14 @@ XLogWrite(XLogwrtRqst WriteRqst)
                 * end of the last page that's been initialized by
                 * AdvanceXLInsertBuffer.
                 */
-               if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
+               if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[currentIndex]))
                        elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
                                 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
-                                XLogCtl->xlblocks[Write->curridx].xlogid,
-                                XLogCtl->xlblocks[Write->curridx].xrecoff);
+                                XLogCtl->xlblocks[currentIndex].xlogid,
+                                XLogCtl->xlblocks[currentIndex].xrecoff);
 
                /* Advance LogwrtResult.Write to end of current buffer page */
-               LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
+               LogwrtResult.Write = XLogCtl->xlblocks[currentIndex];
                ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
 
                if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
@@ -1281,6 +1335,7 @@ XLogWrite(XLogwrtRqst WriteRqst)
                        /*
                         * Switch to new logfile segment.
                         */
+                       XLogPageFlush(&pages, currentIndex);
                        if (openLogFile >= 0)
                        {
                                if (close(openLogFile))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
                        openLogOff = 0;
                }
 
-               /* Need to seek in the file? */
-               if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
-               {
-                       openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
-                       if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
-                               ereport(PANIC,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not seek in log file %u, segment %u to offset %u: %m",
-                                                               openLogId, openLogSeg, openLogOff)));
-               }
-
-               /* OK to write the page */
-               from = XLogCtl->pages + Write->curridx * BLCKSZ;
-               errno = 0;
-               if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
-               {
-                       /* if write didn't set errno, assume problem is no disk space */
-                       if (errno == 0)
-                               errno = ENOSPC;
-                       ereport(PANIC,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not write to log file %u, segment %u at offset %u: %m",
-                                                       openLogId, openLogSeg, openLogOff)));
-               }
-               openLogOff += BLCKSZ;
+               /* Add a page to buffer */
+               XLogPageWrite(&pages, currentIndex);
 
                /*
                 * If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
                 * This is also the right place to notify the Archiver that the
                 * segment is ready to copy to archival storage.
                 */
-               if (openLogOff >= XLogSegSize && !ispartialpage)
+               if (openLogOff + pages.size >= XLogSegSize && !ispartialpage)
                {
+                       XLogPageFlush(&pages, currentIndex);
                        issue_xlog_fsync();
                        LogwrtResult.Flush = LogwrtResult.Write;        /* end of current page */
 
@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
                        LogwrtResult.Write = WriteRqst.Write;
                        break;
                }
-               Write->curridx = NextBufIdx(Write->curridx);
+               currentIndex = NextBufIdx(currentIndex);
        }
+       XLogPageFlush(&pages, currentIndex);
 
        /*
         * If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
        if (XLOGbuffers < MinXLOGbuffers)
                XLOGbuffers = MinXLOGbuffers;
 
-       return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
+       return XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
                + BLCKSZ * XLOGbuffers +
                MAXALIGN(sizeof(ControlFileData));
 }
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
 
        XLogCtl = (XLogCtlData *)
                ShmemInitStruct("XLOG Ctl",
-                                               MAXALIGN(sizeof(XLogCtlData) +
+                                               XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) +
                                                                 sizeof(XLogRecPtr) * XLOGbuffers)
                                                + BLCKSZ * XLOGbuffers,
                                                &foundXLog);
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
         * Here, on the other hand, we must MAXALIGN to ensure the page
         * buffers have worst-case alignment.
         */
-       XLogCtl->pages =
-               ((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
-                                                                         sizeof(XLogRecPtr) * XLOGbuffers);
+       XLogCtl->pages = XLOG_BUFFER_POINTERALIGN(
+               ((char *) XLogCtl)
+               + sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers);
        memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);
 
        /*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
        /* First timeline ID is always 1 */
        ThisTimeLineID = 1;
 
-       /* Use malloc() to ensure buffer is MAXALIGNED */
-       buffer = (char *) malloc(BLCKSZ);
-       page = (XLogPageHeader) buffer;
-       memset(buffer, 0, BLCKSZ);
+       buffer = (char *) malloc(BLCKSZ + ALIGNOF_XLOG_BUFFER);
+       page = (XLogPageHeader) XLOG_BUFFER_POINTERALIGN(buffer);
+       memset(page, 0, BLCKSZ);
 
        /* Set up information for the initial checkpoint record */
        checkPoint.redo.xlogid = 0;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
 
        /* Write the first page with the initial record */
        errno = 0;
-       if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
+       if (write(openLogFile, page, BLCKSZ) != BLCKSZ)
        {
                /* if write didn't set errno, assume problem is no disk space */
                if (errno == 0)
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
                                         errmsg("could not remove file \"%s\": %m",
                                                        BACKUP_LABEL_FILE)));
 }
+
+
+/* XLog gather-write stuff */
+/*
+ * XLogPageReset
+ *             Zero every field of *pages, marking the run as empty.
+ *             XLogPageFlush tests pages->head to detect an empty run.
+ */
+static void
+XLogPageReset(XLogPages *pages)
+{
+       memset(pages, 0, sizeof(*pages));
+}
+/*
+ * XLogPageWrite
+ *             Queue WAL buffer page number "index" for writing.
+ *
+ * The page's address is XLogCtl->pages + index * BLCKSZ.  Its file offset is
+ * computed as (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize, i.e.
+ * LogwrtResult.Write is treated as the end of this page.  If the page
+ * directly follows the queued run both in memory and in the file, the run
+ * is simply extended; otherwise the queued run is flushed and a new run is
+ * started with this page.
+ *
+ * NOTE(review): the flush in the else branch passes PrevBufIdx(index), so
+ * XLogCtl->Write.curridx trails "index" until a later XLogPageFlush call
+ * stores a new value -- confirm this is intended.
+ */
+static void
+XLogPageWrite(XLogPages *pages, int index)
+{
+       char *page = XLogCtl->pages + index * BLCKSZ;
+       int size = BLCKSZ;
+       int offset = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
+
+       if (pages->head + pages->size == page
+               && pages->offset + pages->size == offset)
+       {       /* Pages are contiguous. Append the new page to the run. */
+               pages->size += size;
+       }
+       else
+       {       /* Pages are not contiguous. Flush the run and start a new one. */
+               XLogPageFlush(pages, PrevBufIdx(index));
+               pages->head = page;
+               pages->size = size;
+               pages->offset = offset;
+       }
+}
+/*
+ * XLogPageFlush
+ *             Write out the queued run of pages, if any, and store "index" in
+ *             XLogCtl->Write.curridx.
+ *
+ * Seeks first if openLogOff differs from the run's file offset, then issues
+ * one write() covering the whole run.  A failed seek, or a write() that does
+ * not return the full byte count, is reported with ereport(PANIC).  On
+ * success openLogOff is advanced by the number of bytes written and *pages
+ * is reset to the empty state.
+ */
+static void
+XLogPageFlush(XLogPages *pages, int index)
+{
+       if (!pages->head)
+       {       /* Nothing is queued; just record the index. */
+               XLogCtl->Write.curridx = index;
+               return;
+       }
+       
+       /* Need to seek in the file? */
+       if (openLogOff != pages->offset)
+       {
+               openLogOff = pages->offset;
+               if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
+                       ereport(PANIC,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not seek in log file %u, segment %u to offset %u: %m",
+                                                       openLogId, openLogSeg, openLogOff)));
+       }
+
+       /* OK to write the whole run in one call */
+       errno = 0;
+       if (write(openLogFile, pages->head, pages->size) != pages->size)
+       {
+               /* if write didn't set errno, assume problem is no disk space */
+               if (errno == 0)
+                       errno = ENOSPC;
+               ereport(PANIC,
+                               (errcode_for_file_access(),
+                                errmsg("could not write to log file %u, segment %u at offset %u: %m",
+                                               openLogId, openLogSeg, openLogOff)));
+       }
+
+       openLogOff += pages->size;
+       XLogCtl->Write.curridx = index;
+       XLogPageReset(pages);
+}