diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 074afee494..4e0492b939 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF;
- When the commit data for a transaction is flushed to disk, any
- additional commits ready at that time are also flushed out.
commit_delay adds a time delay, set in
- microseconds, before a transaction attempts to
- flush the WAL buffer out to disk. A nonzero delay can allow more
- transactions to be committed with only one flush operation, if
- system load is high enough that additional transactions become
- ready to commit within the given interval. But the delay is
- just wasted if no other transactions become ready to
- commit. Therefore, the delay is only performed if at least
- commit_siblings other transactions are
- active at the instant that a server process has written its
- commit record.
- The default commit_delay is zero (no delay).
- Since all pending commit data will be written at every flush
- regardless of this setting, it is rare that adding delay
- by increasing this parameter will actually improve performance.
+ microseconds, before a WAL flush is initiated. This can improve
+ group commit throughput by allowing a larger number of transactions
+ to commit via a single WAL flush, if system load is high enough
+ that additional transactions become ready to commit within the
+ given interval. However, it also increases latency by up to
+ commit_delay microseconds for each WAL
+ flush. Because the delay is just wasted if no other transactions
+ become ready to commit, it is only performed if at least
+ commit_siblings other transactions are active
+ immediately before a flush would otherwise have been initiated.
+ In PostgreSQL releases prior to 9.3,
+ commit_delay behaved differently and was much
+ less effective: it affected only commits, rather than all WAL flushes,
+ and waited for the entire configured delay even if the WAL flush
+ was completed sooner. Beginning in PostgreSQL 9.3,
+ the first process that becomes ready to flush waits for the configured
+ interval, while subsequent processes wait only until the leader
+ completes the flush. The default commit_delay is zero
+ (no delay).
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 0afb9d6af6..a98132d3f2 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -376,9 +376,7 @@
WAL to disk, in the hope that a single flush
executed by one such transaction can also serve other transactions
committing at about the same time. Setting commit_delay
- can only help when there are many concurrently committing transactions,
- and it is difficult to tune it to a value that actually helps rather
- than hurt throughput.
+ can only help when there are many concurrently committing transactions.
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 86b1afa80d..49def6abbb 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -68,9 +68,6 @@ bool XactDeferrable;
int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
-int CommitDelay = 0; /* precommit delay in microseconds */
-int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
-
/*
* MyXactAccessedTempRel is set when a temporary relation is accessed.
* We don't allow PREPARE TRANSACTION in that case. (This is global
@@ -1123,22 +1120,6 @@ RecordTransactionCommit(void)
if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
forceSyncCommit || nrels > 0)
{
- /*
- * Synchronous commit case:
- *
- * Sleep before flush! So we can flush more than one commit records
- * per single fsync. (The idea is some other backend may do the
- * XLogFlush while we're sleeping. This needs work still, because on
- * most Unixen, the minimum select() delay is 10msec or more, which is
- * way too long.)
- *
- * We do not sleep if enableFsync is not turned on, nor if there are
- * fewer than CommitSiblings other backends with active transactions.
- */
- if (CommitDelay > 0 && enableFsync &&
- MinimumActiveBackends(CommitSiblings))
- pg_usleep(CommitDelay);
-
XLogFlush(XactLastRecEnd);
/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a43e2eeaf3..6ee50d01d5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -80,6 +80,8 @@ bool fullPageWrites = true;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
+int CommitDelay = 0; /* precommit delay in microseconds */
+int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
@@ -2098,34 +2100,49 @@ XLogFlush(XLogRecPtr record)
*/
continue;
}
- /* Got the lock */
- LogwrtResult = XLogCtl->LogwrtResult;
- if (!XLByteLE(record, LogwrtResult.Flush))
- {
- /* try to write/flush later additions to XLOG as well */
- if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
- {
- XLogCtlInsert *Insert = &XLogCtl->Insert;
- uint32 freespace = INSERT_FREESPACE(Insert);
- if (freespace == 0) /* buffer is full */
- WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
- else
- {
- WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
- WriteRqstPtr -= freespace;
- }
- LWLockRelease(WALInsertLock);
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = WriteRqstPtr;
- }
+ /* Got the lock; recheck whether request is satisfied */
+ LogwrtResult = XLogCtl->LogwrtResult;
+ if (XLByteLE(record, LogwrtResult.Flush))
+ break;
+
+ /*
+ * Sleep before flush! By adding a delay here, we may give further
+ * backends the opportunity to join the backlog of group commit
+ * followers; this can significantly improve transaction throughput, at
+ * the risk of increasing transaction latency.
+ *
+ * We do not sleep if enableFsync is not turned on, nor if there are
+ * fewer than CommitSiblings other backends with active transactions.
+ */
+ if (CommitDelay > 0 && enableFsync &&
+ MinimumActiveBackends(CommitSiblings))
+ pg_usleep(CommitDelay);
+
+ /* try to write/flush later additions to XLOG as well */
+ if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
+ {
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint32 freespace = INSERT_FREESPACE(Insert);
+
+ if (freespace == 0) /* buffer is full */
+ WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
else
{
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = record;
+ WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
+ WriteRqstPtr -= freespace;
}
- XLogWrite(WriteRqst, false, false);
+ LWLockRelease(WALInsertLock);
+ WriteRqst.Write = WriteRqstPtr;
+ WriteRqst.Flush = WriteRqstPtr;
}
+ else
+ {
+ WriteRqst.Write = WriteRqstPtr;
+ WriteRqst.Flush = record;
+ }
+ XLogWrite(WriteRqst, false, false);
+
LWLockRelease(WALWriteLock);
/* done */
break;