Mirror of https://git.postgresql.org/git/postgresql.git (synced 2025-01-18 18:44:06 +08:00)
Replace buffer I/O locks with condition variables.
1. Backends waiting for buffer I/O are now interruptible.

2. If something goes wrong in a backend that is currently performing I/O,
   waiting backends no longer wake up until that backend reaches
   AbortBufferIO() and broadcasts on the CV.  Previously, any waiters would
   wake up (because the I/O lock was automatically released) and then
   busy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS.

3. LWLockMinimallyPadded is removed, as it would now be unused.

Author: Robert Haas <robertmhaas@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (earlier version, 2016)
Discussion: https://postgr.es/m/CA%2BhUKGJ8nBFrjLuCTuqKN0pd2PQOwj9b_jnsiGFFMDvUxahj_A%40mail.gmail.com
Discussion: https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=C56Xng@mail.gmail.com
commit d87251048a
parent c3ffe34863
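As a quick orientation before the diff: the waiter side of the new scheme, condensed from the new WaitIO() in bufmgr.c below. This is an illustrative sketch rather than a literal copy; the function name is hypothetical, but the flag, the per-buffer condition variable accessor and the wait event are the ones this commit introduces.

/*
 * Illustrative sketch (hypothetical helper name): how a backend now waits
 * for buffer I/O.  The flag is checked under the buffer header spinlock;
 * the sleep happens on the per-buffer condition variable, so the wait is
 * interruptible and only ends once the backend doing the I/O clears the
 * flag and broadcasts (in TerminateBufferIO or AbortBufferIO).
 */
static void
wait_for_buffer_io(BufferDesc *buf)
{
	ConditionVariable *cv = BufferDescriptorGetIOCV(buf);

	ConditionVariablePrepareToSleep(cv);
	for (;;)
	{
		uint32		buf_state;

		buf_state = LockBufHdr(buf);
		UnlockBufHdr(buf, buf_state);

		if (!(buf_state & BM_IO_IN_PROGRESS))
			break;
		ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
	}
	ConditionVariableCancelSleep();
}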
@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting for the page number needed to continue a parallel B-tree
scan to become available.</entry>
</row>
<row>
<entry><literal>BufferIO</literal></entry>
<entry>Waiting for buffer I/O to complete.</entry>
</row>
<row>
<entry><literal>CheckpointDone</literal></entry>
<entry>Waiting for a checkpoint to complete.</entry>
@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>BufferContent</literal></entry>
<entry>Waiting to access a data page in memory.</entry>
</row>
<row>
<entry><literal>BufferIO</literal></entry>
<entry>Waiting for I/O on a data page.</entry>
</row>
<row>
<entry><literal>BufferMapping</literal></entry>
<entry>Waiting to associate a data block with a buffer in the buffer
@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_BTREE_PAGE:
event_name = "BtreePage";
break;
case WAIT_EVENT_BUFFER_IO:
event_name = "BufferIO";
break;
case WAIT_EVENT_CHECKPOINT_DONE:
event_name = "CheckpointDone";
break;
@@ -145,14 +145,11 @@ held within the buffer. Each buffer header also contains an LWLock, the
"buffer content lock", that *does* represent the right to access the data
in the buffer. It is used per the rules above.

There is yet another set of per-buffer LWLocks, the io_in_progress locks,
that are used to wait for I/O on a buffer to complete. The process doing
a read or write takes exclusive lock for the duration, and processes that
need to wait for completion try to take shared locks (which they release
immediately upon obtaining). XXX on systems where an LWLock represents
nontrivial resources, it's fairly annoying to need so many locks. Possibly
we could use per-backend LWLocks instead (a buffer header would then contain
a field to show which backend is doing its I/O).
* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
buffer to complete (and in releases before 14, it was accompanied by a
per-buffer LWLock). The process doing a read or write sets the flag for the
duration, and processes that need to wait for it to be cleared sleep on a
condition variable.

Normal Buffer Replacement Strategy
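The README paragraph above can also be read from the I/O performer's side. A condensed sketch, simplified from StartBufferIO()/TerminateBufferIO() in the bufmgr.c hunks below (the function name is hypothetical; error handling, the BM_DIRTY/BM_VALID bookkeeping and the InProgressBuf tracking are omitted):

static void
perform_buffer_io(BufferDesc *buf)
{
	uint32		buf_state;

	/* claim the buffer: only one process may set BM_IO_IN_PROGRESS at a time */
	buf_state = LockBufHdr(buf);
	Assert(!(buf_state & BM_IO_IN_PROGRESS));
	buf_state |= BM_IO_IN_PROGRESS;
	UnlockBufHdr(buf, buf_state);

	/* ... read or write the block here ... */

	/* clear the flag first, then wake everyone sleeping in WaitIO() */
	buf_state = LockBufHdr(buf);
	buf_state &= ~BM_IO_IN_PROGRESS;
	UnlockBufHdr(buf, buf_state);
	ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
}

The broadcast happens only after the flag is cleared, and the error path (AbortBufferIO) follows the same order, which is why waiters no longer busy-loop when the backend doing the I/O aborts.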
@@ -19,7 +19,7 @@
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
ConditionVariableMinimallyPadded *BufferIOCVArray;
WritebackContext BackendWritebackContext;
CkptSortItem *CkptBufferIds;
@@ -68,7 +68,7 @@ InitBufferPool(void)
{
bool foundBufs,
foundDescs,
foundIOLocks,
foundIOCV,
foundBufCkpt;

/* Align descriptors to a cacheline boundary. */
@@ -81,11 +81,11 @@ InitBufferPool(void)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);

/* Align lwlocks to cacheline boundary */
BufferIOLWLockArray = (LWLockMinimallyPadded *)
ShmemInitStruct("Buffer IO Locks",
NBuffers * (Size) sizeof(LWLockMinimallyPadded),
&foundIOLocks);
/* Align condition variables to cacheline boundary. */
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
ShmemInitStruct("Buffer IO Condition Variables",
NBuffers * sizeof(ConditionVariableMinimallyPadded),
&foundIOCV);

/*
* The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);

if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
{
/* should find all of these, or none of them */
Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
@@ -131,8 +131,7 @@ InitBufferPool(void)
LWLockInitialize(BufferDescriptorGetContentLock(buf),
LWTRANCHE_BUFFER_CONTENT);

LWLockInitialize(BufferDescriptorGetIOLock(buf),
LWTRANCHE_BUFFER_IO);
ConditionVariableInit(BufferDescriptorGetIOCV(buf));
}

/* Correct last entry of linked list */
@@ -169,16 +168,9 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());

/*
* It would be nice to include the I/O locks in the BufferDesc, but that
* would increase the size of a BufferDesc to more than one cache line,
* and benchmarking has shown that keeping every BufferDesc aligned on a
* cache line boundary is important for performance. So, instead, the
* array of I/O locks is allocated in a separate tranche. Because those
* locks are not highly contended, we lay out the array with minimal
* padding.
*/
size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
/* size of I/O condition variables */
size = add_size(size, mul_size(NBuffers,
sizeof(ConditionVariableMinimallyPadded)));
/* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE);
@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
LWLockRelease(newPartitionLock);

/*
* Buffer contents are currently invalid. Try to get the io_in_progress
* lock. If StartBufferIO returns false, then someone else managed to
* read it before we did, so there's nothing left for BufferAlloc() to do.
* Buffer contents are currently invalid. Try to obtain the right to
* start I/O. If StartBufferIO returns false, then someone else managed
* to read it before we did, so there's nothing left for BufferAlloc() to
* do.
*/
if (StartBufferIO(buf, true))
*foundPtr = false;
@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
*/
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);

/* I'd better not still hold any locks on the buffer */
/* I'd better not still hold the buffer content lock */
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));

/*
* Decrement the shared reference count.
@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
uint32 buf_state;

/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so we need
* not do anything.
* Try to start an I/O operation. If StartBufferIO returns false, then
* someone else flushed the buffer before we could, so we need not do
* anything.
*/
if (!StartBufferIO(buf, false))
return;
@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing because
* we have the io_in_progress lock.
* only one process at a time can set the BM_IO_IN_PROGRESS bit.
*/
bufBlock = BufHdrGetBlock(buf);
@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)

/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
* end the io_in_progress state.
* end the BM_IO_IN_PROGRESS state.
*/
TerminateBufferIO(buf, true, 0);
@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
* Functions for buffer I/O handling
*
* Note: We assume that nested buffer I/O never occurs.
* i.e at most one io_in_progress lock is held per proc.
* i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
*
* Also note that these are used only for shared buffers, not local ones.
*/
@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
static void
WaitIO(BufferDesc *buf)
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process doing
* I/O could release the io_in_progress_lock prematurely. See
* AbortBufferIO.
*/
ConditionVariable *cv = BufferDescriptorGetIOCV(buf);

ConditionVariablePrepareToSleep(cv);
for (;;)
{
uint32 buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
LWLockRelease(BufferDescriptorGetIOLock(buf));
ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
}
ConditionVariableCancelSleep();
}

/*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
* In some scenarios there are race conditions in which multiple backends
* could attempt the same I/O operation concurrently. If someone else
* has already started I/O on this buffer then we will block on the
* io_in_progress lock until he's done.
* I/O condition variable until he's done.
*
* Input operations are only attempted on buffers that are not BM_VALID,
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
for (;;)
{
/*
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);

buf_state = LockBufHdr(buf);

if (!(buf_state & BM_IO_IN_PROGRESS))
break;

/*
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
* lock isn't held is if the process doing the I/O is recovering from
* an error (see AbortBufferIO). If that's the case, we must wait for
* him to get unwedged.
*/
UnlockBufHdr(buf, buf_state);
LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
{
/* someone else already did the I/O */
UnlockBufHdr(buf, buf_state);
LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* (Assumptions)
* My process is executing IO for the buffer
* BM_IO_IN_PROGRESS bit is set for the buffer
* We hold the buffer's io_in_progress lock
* The buffer is Pinned
*
* If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)

InProgressBuf = NULL;

LWLockRelease(BufferDescriptorGetIOLock(buf));
ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
}

/*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
{
uint32 buf_state;

/*
* Since LWLockReleaseAll has already been called, we're not holding
* the buffer's io_in_progress_lock. We have to re-acquire it so that
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);

buf_state = LockBufHdr(buf);
Assert(buf_state & BM_IO_IN_PROGRESS);
if (IsForInput)
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
"WALInsert",
/* LWTRANCHE_BUFFER_CONTENT: */
"BufferContent",
/* LWTRANCHE_BUFFER_IO: */
"BufferIO",
/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
"ReplicationOriginState",
/* LWTRANCHE_REPLICATION_SLOT_IO: */
@@ -469,8 +467,7 @@ CreateLWLocks(void)
StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
"MAX_BACKENDS too big for lwlock.c");

StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
"Miscalculated LWLock padding");

if (!IsUnderPostmaster)
@@ -971,6 +971,7 @@ typedef enum
WAIT_EVENT_BGWORKER_SHUTDOWN,
WAIT_EVENT_BGWORKER_STARTUP,
WAIT_EVENT_BTREE_PAGE,
WAIT_EVENT_BUFFER_IO,
WAIT_EVENT_CHECKPOINT_DONE,
WAIT_EVENT_CHECKPOINT_START,
WAIT_EVENT_EXECUTE_GATHER,
@@ -18,6 +18,7 @@
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
@@ -184,7 +185,6 @@ typedef struct BufferDesc
int wait_backend_pid; /* backend PID of pin-count waiter */
int freeNext; /* link in freelist chain */

LWLock io_in_progress_lock; /* to wait for I/O to complete */
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
@@ -221,12 +221,12 @@ typedef union BufferDescPadded

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetIOCV(bdesc) \
(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
#define BufferDescriptorGetContentLock(bdesc) \
((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;

/*
* The freeNext field is either the index of the next freelist entry,
@@ -31,6 +31,17 @@ typedef struct
proclist_head wakeup; /* list of wake-able processes */
} ConditionVariable;

/*
* Pad a condition variable to a power-of-two size so that an array of
* condition variables does not cross a cache line boundary.
*/
#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32)
typedef union ConditionVariableMinimallyPadded
{
ConditionVariable cv;
char pad[CV_MINIMAL_SIZE];
} ConditionVariableMinimallyPadded;

/* Initialize a condition variable. */
extern void ConditionVariableInit(ConditionVariable *cv);
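The CV_MINIMAL_SIZE padding above plays the same role for the new condition-variable array that LWLOCK_MINIMAL_SIZE played for the old I/O lock array: each element stays at a power-of-two size that divides PG_CACHE_LINE_SIZE, so a cache-line-aligned array never has an element straddling two lines. A hypothetical compile-time check, in the style of the StaticAssertStmt calls in CreateLWLocks(), could make that explicit:

static void
check_buffer_io_cv_padding(void)
{
	/* hypothetical assertions, not part of the patch */
	StaticAssertStmt(sizeof(ConditionVariableMinimallyPadded) == CV_MINIMAL_SIZE,
					 "ConditionVariableMinimallyPadded is not minimally padded");
	StaticAssertStmt(PG_CACHE_LINE_SIZE % sizeof(ConditionVariableMinimallyPadded) == 0,
					 "padded condition variable size should divide the cache line size");
}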
@@ -48,29 +48,8 @@ typedef struct LWLock
* even more padding so that each LWLock takes up an entire cache line; this is
* useful, for example, in the main LWLock array, where the overall number of
* locks is small but some are heavily contended.
*
* When allocating a tranche that contains data other than LWLocks, it is
* probably best to include a bare LWLock and then pad the resulting structure
* as necessary for performance. For an array that contains only LWLocks,
* LWLockMinimallyPadded can be used for cases where we just want to ensure
* that we don't cross cache line boundaries within a single lock, while
* LWLockPadded can be used for cases where we want each lock to be an entire
* cache line.
*
* An LWLockMinimallyPadded might contain more than the absolute minimum amount
* of padding required to keep a lock from crossing a cache line boundary,
* because an unpadded LWLock will normally fit into 16 bytes. We ignore that
* possibility when determining the minimal amount of padding. Older releases
* had larger LWLocks, so 32 really was the minimum, and packing them in
* tighter might hurt performance.
*
* LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
* because pg_atomic_uint32 is more than 4 bytes on some obscure platforms, we
* allow for the possibility that it might be 64. Even on those platforms,
* we probably won't exceed 32 bytes unless LOCK_DEBUG is defined.
*/
#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)

/* LWLock, padded to a full cache line size */
typedef union LWLockPadded
@@ -79,13 +58,6 @@ typedef union LWLockPadded
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;

/* LWLock, minimally padded */
typedef union LWLockMinimallyPadded
{
LWLock lock;
char pad[LWLOCK_MINIMAL_SIZE];
} LWLockMinimallyPadded;

extern PGDLLIMPORT LWLockPadded *MainLWLockArray;

/* struct for storing named tranche information */
@@ -202,7 +174,6 @@ typedef enum BuiltinTrancheIds
LWTRANCHE_SERIAL_BUFFER,
LWTRANCHE_WAL_INSERT,
LWTRANCHE_BUFFER_CONTENT,
LWTRANCHE_BUFFER_IO,
LWTRANCHE_REPLICATION_ORIGIN_STATE,
LWTRANCHE_REPLICATION_SLOT_IO,
LWTRANCHE_LOCK_FASTPATH,
@@ -398,6 +398,7 @@ CompressionAlgorithm
CompressorState
ComputeXidHorizonsResult
ConditionVariable
ConditionVariableMinimallyPadded
ConditionalStack
ConfigData
ConfigVariable