Make commit_delay much smarter.

author Robert Haas <rhaas@postgresql.org>
Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)

committer Robert Haas <rhaas@postgresql.org>
Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 074afee494eca127d31617551676bbebf74906fd..4e0492b939399a167df87d448560d5b1712386bb 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF;
        </indexterm>
        <listitem>
         <para>
-        When the commit data for a transaction is flushed to disk, any
-        additional commits ready at that time are also flushed out.
          <varname>commit_delay</varname> adds a time delay, set in
-        microseconds, before a transaction attempts to
-        flush the WAL buffer out to disk.  A nonzero delay can allow more
-        transactions to be committed with only one flush operation, if
-        system load is high enough that additional transactions become
-        ready to commit within the given interval. But the delay is
-        just wasted if no other transactions become ready to
-        commit. Therefore, the delay is only performed if at least
-        <varname>commit_siblings</varname> other transactions are
-        active at the instant that a server process has written its
-        commit record.
-        The default <varname>commit_delay</> is zero (no delay).
-        Since all pending commit data will be written at every flush
-        regardless of this setting, it is rare that adding delay
-        by increasing this parameter will actually improve performance.
+        microseconds, before a WAL flush is initiated.  This can improve
+        group commit throughput by allowing a larger number of transactions
+        to commit via a single WAL flush, if system load is high enough
+        that additional transactions become ready to commit within the
+        given interval.  However, it also increases latency by up to
+        <varname>commit_delay</varname> microseconds for each WAL
+        flush.  Because the delay is just wasted if no other transactions
+        become ready to commit, it is only performed if at least
+        <varname>commit_siblings</varname> other transactions are active
+        immediately before a flush would otherwise have been initiated.
+        In <productname>PostgreSQL</> releases prior to 9.3,
+        <varname>commit_delay</varname> behaved differently and was much
+        less effective: it affected only commits, rather than all WAL flushes,
+        and waited for the entire configured delay even if the WAL flush
+        was completed sooner.  Beginning in <productname>PostgreSQL</> 9.3, 
+        the first process that becomes ready to flush waits for the configured
+        interval, while subsequent processes wait only until the leader
+        completes the flush.  The default <varname>commit_delay</> is zero
+        (no delay).
         </para>
        </listitem>
       </varlistentry>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml

index 0afb9d6af600da01c943534f0089f97dc311c8d4..a98132d3f2a1f8c16d3a15d26d5d6251d87ca3cb 100644 (file)
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -376,9 +376,7 @@
     <acronym>WAL</acronym> to disk, in the hope that a single flush
     executed by one such transaction can also serve other transactions
     committing at about the same time.  Setting <varname>commit_delay</varname>
-   can only help when there are many concurrently committing transactions,
-   and it is difficult to tune it to a value that actually helps rather
-   than hurt throughput.
+   can only help when there are many concurrently committing transactions.
    </para>
  
   </sect1>
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index 86b1afa80d9330bcf5ac5adb223b7ebc00c08f53..49def6abbb66a2738c256dc6ae962f2837676eea 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -68,9 +68,6 @@ bool          XactDeferrable;
  
  int                    synchronous_commit = SYNCHRONOUS_COMMIT_ON;
  
-int                    CommitDelay = 0;        /* precommit delay in microseconds */
-int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
-
  /*
   * MyXactAccessedTempRel is set when a temporary relation is accessed.
   * We don't allow PREPARE TRANSACTION in that case.  (This is global
@@ -1123,22 +1120,6 @@ RecordTransactionCommit(void)
         if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
                 forceSyncCommit || nrels > 0)
         {
-               /*
-                * Synchronous commit case:
-                *
-                * Sleep before flush! So we can flush more than one commit records
-                * per single fsync.  (The idea is some other backend may do the
-                * XLogFlush while we're sleeping.  This needs work still, because on
-                * most Unixen, the minimum select() delay is 10msec or more, which is
-                * way too long.)
-                *
-                * We do not sleep if enableFsync is not turned on, nor if there are
-                * fewer than CommitSiblings other backends with active transactions.
-                */
-               if (CommitDelay > 0 && enableFsync &&
-                       MinimumActiveBackends(CommitSiblings))
-                       pg_usleep(CommitDelay);
-
                 XLogFlush(XactLastRecEnd);
  
                 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index a43e2eeaf306eb15146abf8e7a253fa9f38cdb50..6ee50d01d52a1abf892790841c07d646929ebb32 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -80,6 +80,8 @@ bool          fullPageWrites = true;
  bool           log_checkpoints = false;
  int                    sync_method = DEFAULT_SYNC_METHOD;
  int                    wal_level = WAL_LEVEL_MINIMAL;
+int                    CommitDelay = 0;        /* precommit delay in microseconds */
+int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
  
  #ifdef WAL_DEBUG
  bool           XLOG_DEBUG = false;
@@ -2098,34 +2100,49 @@ XLogFlush(XLogRecPtr record)
                          */
                         continue;
                 }
-               /* Got the lock */
+
+               /* Got the lock; recheck whether request is satisfied */
                 LogwrtResult = XLogCtl->LogwrtResult;
-               if (!XLByteLE(record, LogwrtResult.Flush))
+               if (XLByteLE(record, LogwrtResult.Flush))
+                       break;
+
+               /*
+                * Sleep before flush! By adding a delay here, we may give further
+                * backends the opportunity to join the backlog of group commit
+                * followers; this can significantly improve transaction throughput, at
+                * the risk of increasing transaction latency.
+                *
+                * We do not sleep if enableFsync is not turned on, nor if there are
+                * fewer than CommitSiblings other backends with active transactions.
+                */
+               if (CommitDelay > 0 && enableFsync &&
+                       MinimumActiveBackends(CommitSiblings))
+                       pg_usleep(CommitDelay);
+
+               /* try to write/flush later additions to XLOG as well */
+               if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
                 {
-                       /* try to write/flush later additions to XLOG as well */
-                       if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
-                       {
-                               XLogCtlInsert *Insert = &XLogCtl->Insert;
-                               uint32          freespace = INSERT_FREESPACE(Insert);
+                       XLogCtlInsert *Insert = &XLogCtl->Insert;
+                       uint32          freespace = INSERT_FREESPACE(Insert);
  
-                               if (freespace == 0)             /* buffer is full */
-                                       WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-                               else
-                               {
-                                       WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-                                       WriteRqstPtr -= freespace;
-                               }
-                               LWLockRelease(WALInsertLock);
-                               WriteRqst.Write = WriteRqstPtr;
-                               WriteRqst.Flush = WriteRqstPtr;
-                       }
+                       if (freespace == 0)             /* buffer is full */
+                               WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
                         else
                         {
-                               WriteRqst.Write = WriteRqstPtr;
-                               WriteRqst.Flush = record;
+                               WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
+                               WriteRqstPtr -= freespace;
                         }
-                       XLogWrite(WriteRqst, false, false);
+                       LWLockRelease(WALInsertLock);
+                       WriteRqst.Write = WriteRqstPtr;
+                       WriteRqst.Flush = WriteRqstPtr;
                 }
+               else
+               {
+                       WriteRqst.Write = WriteRqstPtr;
+                       WriteRqst.Flush = record;
+               }
+               XLogWrite(WriteRqst, false, false);
+
                 LWLockRelease(WALWriteLock);
                 /* done */
                 break;
author	Robert Haas <rhaas@postgresql.org>
	Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)
committer	Robert Haas <rhaas@postgresql.org>
	Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)
doc/src/sgml/config.sgml		patch | blob | history
doc/src/sgml/wal.sgml		patch | blob | history
src/backend/access/transam/xact.c		patch | blob | history
src/backend/access/transam/xlog.c		patch | blob | history