diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 0fcde95ccd..6197a60c97 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -57,8 +57,8 @@ typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; - BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */ - BlockNumber lastUsedPage; /* blkno of last non-recyclable page */ + BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ + BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber totFreePages; /* true total # of free pages */ MemoryContext pagedelcontext; } BTVacState; @@ -631,7 +631,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ - vstate.lastUsedPage = BTREE_METAPAGE; + vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ @@ -687,27 +687,30 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } /* - * InHotStandby we need to scan right up to the end of the index for - * correct locking, so we may need to write a WAL record for the final - * block in the index if it was not vacuumed. It's possible that VACUUMing - * has actually removed zeroed pages at the end of the index so we need to - * take care to issue the record for last actual block and not for the - * last block that was scanned. Ignore empty indexes. + * If the WAL is replayed in hot standby, the replay process needs to get + * cleanup locks on all index leaf pages, just as we've been doing here. + * However, we won't issue any WAL records about pages that have no items + * to be deleted. For pages between pages we've vacuumed, the replay code + * will take locks under the direction of the lastBlockVacuumed fields in + * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one + * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record + * against the last leaf page in the index, if that one wasn't vacuumed. */ if (XLogStandbyInfoActive() && - num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) + vstate.lastBlockVacuumed < vstate.lastBlockLocked) { Buffer buf; /* - * We can't use _bt_getbuf() here because it always applies - * _bt_checkpage(), which will barf on an all-zero page. We want to - * recycle all-zero pages, not fail. Also, we want to use a - * nondefault buffer access strategy. + * The page should be valid, but we can't use _bt_getbuf() because we + * want to use a nondefault buffer access strategy. Since we aren't + * going to delete any items, getting cleanup lock again is probably + * overkill, but for consistency do that anyway. */ - buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, - info->strategy); + buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked, + RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); + _bt_checkpage(rel, buf); _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } @@ -782,10 +785,6 @@ restart: } } - /* If the page is in use, update lastUsedPage */ - if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno) - vstate->lastUsedPage = blkno; - /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { @@ -821,6 +820,13 @@ restart: LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); + /* + * Remember highest leaf page number we've taken cleanup lock on; see + * notes in btvacuumscan + */ + if (blkno > vstate->lastBlockLocked) + vstate->lastBlockLocked = blkno; + /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started @@ -887,18 +893,26 @@ restart: */ if (ndeletable > 0) { - BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); - - _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); + /* + * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an + * instruction to the replay code to get cleanup lock on all pages + * between the previous lastBlockVacuumed and this page. This + * ensures that WAL replay locks all leaf pages at some point. + * + * Since we can visit leaf pages out-of-order when recursing, + * replay might end up locking such pages an extra time, but it + * doesn't seem worth the amount of bookkeeping it'd take to avoid + * that. + */ + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + vstate->lastBlockVacuumed); /* - * Keep track of the block number of the lastBlockVacuumed, so we - * can scan those blocks as well during WAL replay. This then - * provides concurrency protection and allows btrees to be used - * while in recovery. + * Remember highest leaf page number we've issued a + * XLOG_BTREE_VACUUM WAL record for. */ - if (lastBlockVacuumed > vstate->lastBlockVacuumed) - vstate->lastBlockVacuumed = lastBlockVacuumed; + if (blkno > vstate->lastBlockVacuumed) + vstate->lastBlockVacuumed = blkno; stats->tuples_removed += ndeletable; /* must recompute maxoff */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index d8f306b3a6..f8670bfb30 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -489,28 +489,47 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) BTPageOpaque opaque; /* - * If queries might be active then we need to ensure every block is + * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there - * are any. This ensures that every block in the index is touched during - * VACUUM as required to ensure scans work correctly. + * are any. This prevents replay of the VACUUM from reaching the stage of + * removing heap tuples while there could still be indexscans "in flight" + * to those particular tuples (see nbtree/README). + * + * It might be worth checking if there are actually any backends running; + * if not, we could just skip this. + * + * Since VACUUM can visit leaf pages out-of-order, it might issue records + * with lastBlockVacuumed >= block; that's not an error, it just means + * nothing to do now. + * + * Note: since we touch all pages in the range, we will lock non-leaf + * pages, and also any empty (all-zero) pages that may be in the index. It + * doesn't seem worth the complexity to avoid that. But it's important + * that HotStandbyActiveInReplay() will not return true if the database + * isn't yet consistent; so we need not fear reading still-corrupt blocks + * here during crash recovery. */ - if (standbyState == STANDBY_SNAPSHOT_READY && - (xlrec->lastBlockVacuumed + 1) != xlrec->block) + if (HotStandbyActiveInReplay()) { - BlockNumber blkno = xlrec->lastBlockVacuumed + 1; + BlockNumber blkno; - for (; blkno < xlrec->block; blkno++) + for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++) { /* + * We use RBM_NORMAL_NO_LOG mode because it's not an error + * condition to see all-zero pages. The original btvacuumpage + * scan would have skipped over all-zero pages, noting them in FSM + * but not bothering to initialize them just yet; so we mustn't + * throw an error here. (We could skip acquiring the cleanup lock + * if PageIsNew, but it's probably not worth the cycles to test.) + * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. - * - * Another simple optimization would be to check if there's any - * backends running; if not, we could just skip this. */ - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL); + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, + RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c44bc235a2..7424201c43 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -159,6 +159,9 @@ static XLogRecPtr LastRec; */ static bool LocalRecoveryInProgress = true; +/* True if we're allowing hot standby backends, but only in startup process */ +static bool backendsAllowed = false; + /* * Local state for XLogInsertAllowed(): * 1: unconditionally allowed to insert XLOG @@ -6654,8 +6657,6 @@ StartupXLOG(void) static void CheckRecoveryConsistency(void) { - static bool backendsAllowed = false; - /* * During crash recovery, we don't reach a consistent state until we've * replayed all the WAL. @@ -6734,6 +6735,16 @@ RecoveryInProgress(void) } } +/* + * Like HotStandbyActive(), but to be used only in WAL replay code, + * where we don't need to ask any other process what the state is. + */ +bool +HotStandbyActiveInReplay(void) +{ + return backendsAllowed; +} + /* * Is this process allowed to insert new WAL records? * diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 3d7c7cf69e..a059b60d1a 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -264,6 +264,10 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) * * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the * relation is extended with all-zeroes pages up to the given block number. + * + * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't + * exist, and we don't check for all-zeroes. Thus, no log entry is made + * to imply that the page should be dropped or truncated later. */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, @@ -304,6 +308,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, log_invalid_page(rnode, forknum, blkno, false); return InvalidBuffer; } + if (mode == RBM_NORMAL_NO_LOG) + return InvalidBuffer; /* OK to extend the file */ /* we do this in recovery only - no rel-extension lock needed */ Assert(InRecovery); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 68fee2dff6..6e964da013 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -197,7 +197,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * Assume when this function is called, that reln has been opened already. * * In RBM_NORMAL mode, the page is read from disk, and the page header is - * validated. An error is thrown if the page header is not valid. + * validated. An error is thrown if the page header is not valid. (But + * note that an all-zero page is considered "valid"; see PageIsVerified().) * * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not * valid, the page is zeroed instead of throwing an error. This is intended @@ -211,6 +212,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. * + * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. + * * If strategy is not NULL, a nondefault buffer access strategy is used. * See buffer/README for details. */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 78522a52a6..308c6a7c81 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -286,6 +286,7 @@ extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); extern void issue_xlog_fsync(int fd, uint32 log, uint32 seg); extern bool RecoveryInProgress(void); +extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index dc4376ee9a..4b949dcaa4 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -38,7 +38,9 @@ typedef enum RBM_NORMAL, /* Normal read */ RBM_ZERO, /* Don't read from disk, caller will * initialize */ - RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */ + RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ + RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL + * replay; otherwise same as RBM_NORMAL */ } ReadBufferMode; /* in globals.c ... this duplicates miscadmin.h */