granicus.if.org Git - postgresql/commitdiff
Make commit_delay much smarter.
author Robert Haas <rhaas@postgresql.org>
Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)
committer Robert Haas <rhaas@postgresql.org>
Mon, 2 Jul 2012 14:26:31 +0000 (10:26 -0400)
Instead of letting every backend participating in a group commit wait
independently, have the first one that becomes ready to flush WAL wait
for the configured delay, and let all the others wait just long enough
for that first process to complete its flush.  This greatly increases
the chances of being able to configure a commit_delay setting that
actually improves performance.

As a side consequence of this change, commit_delay now affects all WAL
flushes, rather than just commits.  There was some discussion on
pgsql-hackers about whether to rename the GUC to, say, wal_flush_delay,
but in the absence of consensus I am leaving it alone for now.

Peter Geoghegan, with some changes, mostly to the documentation, by me.

doc/src/sgml/config.sgml
doc/src/sgml/wal.sgml
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c

index 074afee494eca127d31617551676bbebf74906fd..4e0492b939399a167df87d448560d5b1712386bb 100644 (file)
@@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF;
       </indexterm>
       <listitem>
        <para>
-        When the commit data for a transaction is flushed to disk, any
-        additional commits ready at that time are also flushed out.
         <varname>commit_delay</varname> adds a time delay, set in
-        microseconds, before a transaction attempts to
-        flush the WAL buffer out to disk.  A nonzero delay can allow more
-        transactions to be committed with only one flush operation, if
-        system load is high enough that additional transactions become
-        ready to commit within the given interval. But the delay is
-        just wasted if no other transactions become ready to
-        commit. Therefore, the delay is only performed if at least
-        <varname>commit_siblings</varname> other transactions are
-        active at the instant that a server process has written its
-        commit record.
-        The default <varname>commit_delay</> is zero (no delay).
-        Since all pending commit data will be written at every flush
-        regardless of this setting, it is rare that adding delay
-        by increasing this parameter will actually improve performance.
+        microseconds, before a WAL flush is initiated.  This can improve
+        group commit throughput by allowing a larger number of transactions
+        to commit via a single WAL flush, if system load is high enough
+        that additional transactions become ready to commit within the
+        given interval.  However, it also increases latency by up to
+        <varname>commit_delay</varname> microseconds for each WAL
+        flush.  Because the delay is just wasted if no other transactions
+        become ready to commit, it is only performed if at least
+        <varname>commit_siblings</varname> other transactions are active
+        immediately before a flush would otherwise have been initiated.
+        In <productname>PostgreSQL</> releases prior to 9.3,
+        <varname>commit_delay</varname> behaved differently and was much
+        less effective: it affected only commits, rather than all WAL flushes,
+        and waited for the entire configured delay even if the WAL flush
+        was completed sooner.  Beginning in <productname>PostgreSQL</> 9.3, 
+        the first process that becomes ready to flush waits for the configured
+        interval, while subsequent processes wait only until the leader
+        completes the flush.  The default <varname>commit_delay</> is zero
+        (no delay).
        </para>
       </listitem>
      </varlistentry>
index 0afb9d6af600da01c943534f0089f97dc311c8d4..a98132d3f2a1f8c16d3a15d26d5d6251d87ca3cb 100644 (file)
    <acronym>WAL</acronym> to disk, in the hope that a single flush
    executed by one such transaction can also serve other transactions
    committing at about the same time.  Setting <varname>commit_delay</varname>
-   can only help when there are many concurrently committing transactions,
-   and it is difficult to tune it to a value that actually helps rather
-   than hurt throughput.
+   can only help when there are many concurrently committing transactions.
   </para>
 
  </sect1>
index 86b1afa80d9330bcf5ac5adb223b7ebc00c08f53..49def6abbb66a2738c256dc6ae962f2837676eea 100644 (file)
@@ -68,9 +68,6 @@ bool          XactDeferrable;
 
 int                    synchronous_commit = SYNCHRONOUS_COMMIT_ON;
 
-int                    CommitDelay = 0;        /* precommit delay in microseconds */
-int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
-
 /*
  * MyXactAccessedTempRel is set when a temporary relation is accessed.
  * We don't allow PREPARE TRANSACTION in that case.  (This is global
@@ -1123,22 +1120,6 @@ RecordTransactionCommit(void)
        if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
                forceSyncCommit || nrels > 0)
        {
-               /*
-                * Synchronous commit case:
-                *
-                * Sleep before flush! So we can flush more than one commit records
-                * per single fsync.  (The idea is some other backend may do the
-                * XLogFlush while we're sleeping.  This needs work still, because on
-                * most Unixen, the minimum select() delay is 10msec or more, which is
-                * way too long.)
-                *
-                * We do not sleep if enableFsync is not turned on, nor if there are
-                * fewer than CommitSiblings other backends with active transactions.
-                */
-               if (CommitDelay > 0 && enableFsync &&
-                       MinimumActiveBackends(CommitSiblings))
-                       pg_usleep(CommitDelay);
-
                XLogFlush(XactLastRecEnd);
 
                /*
index a43e2eeaf306eb15146abf8e7a253fa9f38cdb50..6ee50d01d52a1abf892790841c07d646929ebb32 100644 (file)
@@ -80,6 +80,8 @@ bool          fullPageWrites = true;
 bool           log_checkpoints = false;
 int                    sync_method = DEFAULT_SYNC_METHOD;
 int                    wal_level = WAL_LEVEL_MINIMAL;
+int                    CommitDelay = 0;        /* precommit delay in microseconds */
+int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 
 #ifdef WAL_DEBUG
 bool           XLOG_DEBUG = false;
@@ -2098,34 +2100,49 @@ XLogFlush(XLogRecPtr record)
                         */
                        continue;
                }
-               /* Got the lock */
+
+               /* Got the lock; recheck whether request is satisfied */
                LogwrtResult = XLogCtl->LogwrtResult;
-               if (!XLByteLE(record, LogwrtResult.Flush))
+               if (XLByteLE(record, LogwrtResult.Flush))
+                       break;
+
+               /*
+                * Sleep before flush! By adding a delay here, we may give further
+                * backends the opportunity to join the backlog of group commit
+                * followers; this can significantly improve transaction throughput, at
+                * the risk of increasing transaction latency.
+                *
+                * We do not sleep if enableFsync is not turned on, nor if there are
+                * fewer than CommitSiblings other backends with active transactions.
+                */
+               if (CommitDelay > 0 && enableFsync &&
+                       MinimumActiveBackends(CommitSiblings))
+                       pg_usleep(CommitDelay);
+
+               /* try to write/flush later additions to XLOG as well */
+               if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
                {
-                       /* try to write/flush later additions to XLOG as well */
-                       if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
-                       {
-                               XLogCtlInsert *Insert = &XLogCtl->Insert;
-                               uint32          freespace = INSERT_FREESPACE(Insert);
+                       XLogCtlInsert *Insert = &XLogCtl->Insert;
+                       uint32          freespace = INSERT_FREESPACE(Insert);
 
-                               if (freespace == 0)             /* buffer is full */
-                                       WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-                               else
-                               {
-                                       WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-                                       WriteRqstPtr -= freespace;
-                               }
-                               LWLockRelease(WALInsertLock);
-                               WriteRqst.Write = WriteRqstPtr;
-                               WriteRqst.Flush = WriteRqstPtr;
-                       }
+                       if (freespace == 0)             /* buffer is full */
+                               WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
                        else
                        {
-                               WriteRqst.Write = WriteRqstPtr;
-                               WriteRqst.Flush = record;
+                               WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
+                               WriteRqstPtr -= freespace;
                        }
-                       XLogWrite(WriteRqst, false, false);
+                       LWLockRelease(WALInsertLock);
+                       WriteRqst.Write = WriteRqstPtr;
+                       WriteRqst.Flush = WriteRqstPtr;
                }
+               else
+               {
+                       WriteRqst.Write = WriteRqstPtr;
+                       WriteRqst.Flush = record;
+               }
+               XLogWrite(WriteRqst, false, false);
+
                LWLockRelease(WALWriteLock);
                /* done */
                break;