diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 074afee494..4e0492b939 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF; - When the commit data for a transaction is flushed to disk, any - additional commits ready at that time are also flushed out. commit_delay adds a time delay, set in - microseconds, before a transaction attempts to - flush the WAL buffer out to disk. A nonzero delay can allow more - transactions to be committed with only one flush operation, if - system load is high enough that additional transactions become - ready to commit within the given interval. But the delay is - just wasted if no other transactions become ready to - commit. Therefore, the delay is only performed if at least - commit_siblings other transactions are - active at the instant that a server process has written its - commit record. - The default commit_delay is zero (no delay). - Since all pending commit data will be written at every flush - regardless of this setting, it is rare that adding delay - by increasing this parameter will actually improve performance. + microseconds, before a WAL flush is initiated. This can improve + group commit throughput by allowing a larger number of transactions + to commit via a single WAL flush, if system load is high enough + that additional transactions become ready to commit within the + given interval. However, it also increases latency by up to + commit_delay microseconds for each WAL + flush. Because the delay is just wasted if no other transactions + become ready to commit, it is only performed if at least + commit_siblings other transactions are active + immediately before a flush would otherwise have been initiated. 
+ In PostgreSQL releases prior to 9.3, + commit_delay behaved differently and was much + less effective: it affected only commits, rather than all WAL flushes, + and waited for the entire configured delay even if the WAL flush + was completed sooner. Beginning in PostgreSQL 9.3, + the first process that becomes ready to flush waits for the configured + interval, while subsequent processes wait only until the leader + completes the flush. The default commit_delay is zero + (no delay). No delay is performed if fsync is disabled. diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 0afb9d6af6..a98132d3f2 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -376,9 +376,7 @@ WAL to disk, in the hope that a single flush executed by one such transaction can also serve other transactions committing at about the same time. Setting commit_delay - can only help when there are many concurrently committing transactions, - and it is difficult to tune it to a value that actually helps rather - than hurt throughput. + can only help when there are many concurrently committing transactions. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 86b1afa80d..49def6abbb 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -68,9 +68,6 @@ bool XactDeferrable; int synchronous_commit = SYNCHRONOUS_COMMIT_ON; -int CommitDelay = 0; /* precommit delay in microseconds */ -int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ - /* * MyXactAccessedTempRel is set when a temporary relation is accessed. * We don't allow PREPARE TRANSACTION in that case. (This is global @@ -1123,22 +1120,6 @@ RecordTransactionCommit(void) if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || forceSyncCommit || nrels > 0) { - /* - * Synchronous commit case: - * - * Sleep before flush! So we can flush more than one commit records - * per single fsync. (The idea is some other backend may do the - * XLogFlush while we're sleeping. 
This needs work still, because on - * most Unixen, the minimum select() delay is 10msec or more, which is - * way too long.) - * - * We do not sleep if enableFsync is not turned on, nor if there are - * fewer than CommitSiblings other backends with active transactions. - */ - if (CommitDelay > 0 && enableFsync && - MinimumActiveBackends(CommitSiblings)) - pg_usleep(CommitDelay); - XLogFlush(XactLastRecEnd); /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a43e2eeaf3..6ee50d01d5 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -80,6 +80,8 @@ bool fullPageWrites = true; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -2098,34 +2100,53 @@ XLogFlush(XLogRecPtr record) */ continue; } - /* Got the lock */ - LogwrtResult = XLogCtl->LogwrtResult; - if (!XLByteLE(record, LogwrtResult.Flush)) - { - /* try to write/flush later additions to XLOG as well */ - if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE)) - { - XLogCtlInsert *Insert = &XLogCtl->Insert; - uint32 freespace = INSERT_FREESPACE(Insert); - if (freespace == 0) /* buffer is full */ - WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; - else - { - WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; - WriteRqstPtr -= freespace; - } - LWLockRelease(WALInsertLock); - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = WriteRqstPtr; - } + /* Got the lock; recheck whether request is satisfied */ + LogwrtResult = XLogCtl->LogwrtResult; + if (XLByteLE(record, LogwrtResult.Flush)) + { + /* already flushed by someone else; release WALWriteLock before leaving the loop */ + LWLockRelease(WALWriteLock); + break; + } + + /* + * Sleep before flush! 
By adding a delay here, we may give further + * backends the opportunity to join the backlog of group commit + * followers; this can significantly improve transaction throughput, at + * the risk of increasing transaction latency. + * + * We do not sleep if enableFsync is not turned on, nor if there are + * fewer than CommitSiblings other backends with active transactions. + */ + if (CommitDelay > 0 && enableFsync && + MinimumActiveBackends(CommitSiblings)) + pg_usleep(CommitDelay); + + /* try to write/flush later additions to XLOG as well */ + if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE)) + { + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace = INSERT_FREESPACE(Insert); + + if (freespace == 0) /* buffer is full */ + WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; else { - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = record; + WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; + WriteRqstPtr -= freespace; } - XLogWrite(WriteRqst, false, false); + LWLockRelease(WALInsertLock); + WriteRqst.Write = WriteRqstPtr; + WriteRqst.Flush = WriteRqstPtr; } + else + { + WriteRqst.Write = WriteRqstPtr; + WriteRqst.Flush = record; + } + XLogWrite(WriteRqst, false, false); + LWLockRelease(WALWriteLock); /* done */ break;