mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-02-23 19:39:53 +08:00
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced.
This commit is contained in:
parent
c2b4ed19f6
commit
861d3aa43d
@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
|
||||
if (blkno == P_NEW)
|
||||
elog(ERROR, "hash AM does not use P_NEW");
|
||||
|
||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
|
||||
|
||||
LockBuffer(buf, HASH_WRITE);
|
||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
|
||||
NULL);
|
||||
|
||||
/* ref count and lock type are correct */
|
||||
|
||||
@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
|
||||
if (BufferGetBlockNumber(buf) != blkno)
|
||||
elog(ERROR, "unexpected hash relation size: %u, should be %u",
|
||||
BufferGetBlockNumber(buf), blkno);
|
||||
LockBuffer(buf, HASH_WRITE);
|
||||
}
|
||||
else
|
||||
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
|
||||
|
||||
LockBuffer(buf, HASH_WRITE);
|
||||
{
|
||||
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/* ref count and lock type are correct */
|
||||
|
||||
|
@ -7070,9 +7070,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
|
||||
* not do anything that assumes we are touching a heap.
|
||||
*/
|
||||
buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
|
||||
RBM_ZERO);
|
||||
RBM_ZERO_AND_LOCK);
|
||||
Assert(BufferIsValid(buffer));
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
page = (Page) BufferGetPage(buffer);
|
||||
|
||||
Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
|
||||
|
@ -3187,12 +3187,8 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
|
||||
Page page;
|
||||
|
||||
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
|
||||
RBM_ZERO);
|
||||
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
|
||||
Assert(BufferIsValid(buffer));
|
||||
if (get_cleanup_lock)
|
||||
LockBufferForCleanup(buffer);
|
||||
else
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
page = (Page) BufferGetPage(buffer);
|
||||
|
||||
|
@ -258,7 +258,8 @@ XLogCheckInvalidPages(void)
|
||||
* The returned buffer is exclusively-locked.
|
||||
*
|
||||
* For historical reasons, instead of a ReadBufferMode argument, this only
|
||||
* supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
|
||||
* supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
|
||||
* modes.
|
||||
*/
|
||||
Buffer
|
||||
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
||||
@ -266,8 +267,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
||||
Buffer buf;
|
||||
|
||||
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
|
||||
init ? RBM_ZERO : RBM_NORMAL);
|
||||
if (BufferIsValid(buf))
|
||||
init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
|
||||
if (BufferIsValid(buf) && !init)
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
return buf;
|
||||
@ -286,8 +287,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
||||
* dropped or truncated. If we don't see evidence of that later in the WAL
|
||||
* sequence, we'll complain at the end of WAL replay.)
|
||||
*
|
||||
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
|
||||
* relation is extended with all-zeroes pages up to the given block number.
|
||||
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
|
||||
* with all-zeroes pages up to the given block number.
|
||||
*
|
||||
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
|
||||
* exist, and we don't check for all-zeroes. Thus, no log entry is made
|
||||
@ -341,7 +342,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
|
||||
do
|
||||
{
|
||||
if (buffer != InvalidBuffer)
|
||||
{
|
||||
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
ReleaseBuffer(buffer);
|
||||
}
|
||||
buffer = ReadBufferWithoutRelcache(rnode, forknum,
|
||||
P_NEW, mode, NULL);
|
||||
}
|
||||
@ -349,6 +354,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
|
||||
/* Handle the corner case that P_NEW returns non-consecutive pages */
|
||||
if (BufferGetBlockNumber(buffer) != blkno)
|
||||
{
|
||||
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
ReleaseBuffer(buffer);
|
||||
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
|
||||
mode, NULL);
|
||||
|
@ -214,14 +214,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
|
||||
* valid, the page is zeroed instead of throwing an error. This is intended
|
||||
* for non-critical data, where the caller is prepared to repair errors.
|
||||
*
|
||||
* In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
|
||||
* with zeros instead of reading it from disk. Useful when the caller is
|
||||
* going to fill the page from scratch, since this saves I/O and avoids
|
||||
* In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
|
||||
* filled with zeros instead of reading it from disk. Useful when the caller
|
||||
* is going to fill the page from scratch, since this saves I/O and avoids
|
||||
* unnecessary failure if the page-on-disk has corrupt page headers.
|
||||
* The page is returned locked to ensure that the caller has a chance to
|
||||
* initialize the page before it's made visible to others.
|
||||
* Caution: do not use this mode to read a page that is beyond the relation's
|
||||
* current physical EOF; that is likely to cause problems in md.c when
|
||||
* the page is modified and written out. P_NEW is OK, though.
|
||||
*
|
||||
* RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
|
||||
* a cleanup-strength lock on the page.
|
||||
*
|
||||
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
|
||||
*
|
||||
* If strategy is not NULL, a nondefault buffer access strategy is used.
|
||||
@ -363,6 +368,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
||||
isExtend,
|
||||
found);
|
||||
|
||||
/*
|
||||
* In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
|
||||
* be already locked on return.
|
||||
*/
|
||||
if (!isLocalBuf)
|
||||
{
|
||||
if (mode == RBM_ZERO_AND_LOCK)
|
||||
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
|
||||
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
|
||||
}
|
||||
|
||||
return BufferDescriptorGetBuffer(bufHdr);
|
||||
}
|
||||
|
||||
@ -444,8 +461,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
||||
* Read in the page, unless the caller intends to overwrite it and
|
||||
* just wants us to allocate a buffer.
|
||||
*/
|
||||
if (mode == RBM_ZERO)
|
||||
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
|
||||
mode == RBM_DO_NOT_USE)
|
||||
{
|
||||
MemSet((char *) bufBlock, 0, BLCKSZ);
|
||||
}
|
||||
else
|
||||
{
|
||||
instr_time io_start,
|
||||
@ -486,6 +506,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
|
||||
* the page as valid, to make sure that no other backend sees the zeroed
|
||||
* page before the caller has had a chance to initialize it.
|
||||
*
|
||||
* Since no-one else can be looking at the page contents yet, there is no
|
||||
* difference between an exclusive lock and a cleanup-strength lock.
|
||||
* (Note that we cannot use LockBuffer() of LockBufferForCleanup() here,
|
||||
* because they assert that the buffer is already valid.)
|
||||
*/
|
||||
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
|
||||
|
||||
if (isLocalBuf)
|
||||
{
|
||||
/* Only need to adjust flags */
|
||||
|
@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
|
||||
typedef enum
|
||||
{
|
||||
RBM_NORMAL, /* Normal read */
|
||||
RBM_ZERO, /* Don't read from disk, caller will
|
||||
* initialize */
|
||||
RBM_DO_NOT_USE, /* This used to be RBM_ZERO. Only kept for
|
||||
* binary compatibility with 3rd party
|
||||
* extensions. */
|
||||
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
|
||||
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
|
||||
RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL
|
||||
* replay; otherwise same as RBM_NORMAL */
|
||||
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
|
||||
* initialize. Also locks the page. */
|
||||
RBM_ZERO_AND_CLEANUP_LOCK /* Like RBM_ZERO_AND_LOCK, but locks the page
|
||||
* in "cleanup" mode */
|
||||
} ReadBufferMode;
|
||||
|
||||
/* in globals.c ... this duplicates miscadmin.h */
|
||||
|
Loading…
Reference in New Issue
Block a user