diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 7055c242d20..3c703948448 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -520,6 +520,30 @@ normal running after recovery has completed. This is a key capability because it allows running applications to continue while the standby changes state into a normally running server. +The interlocking required to avoid returning incorrect results from +MVCC scans is not required on standby nodes. That is because +HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(), +HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only +ever used during write transactions, which cannot exist on the standby. +This leaves HeapTupleSatisfiesMVCC() and HeapTupleSatisfiesToast(), so +HeapTupleSatisfiesToast() is the only non-MVCC scan type used on standbys. +There is one minor exception, which is that the optimizer sometimes +looks at the boundaries of value ranges using SnapshotDirty, which +could result in returning a newer value for query statistics; this +would affect the query plan in rare cases, but not the correctness. +The risk window is small since the stats look at the min and max values +in the index, so the scan retrieves a tid then immediately uses it +to look in the heap. It is unlikely that the tid could have been +deleted, vacuumed and re-inserted in the time taken to look in the heap +via direct tid access. So we ignore that scan type as a problem. +This means if we re-check the results of any scan of a toast index we +will be able to completely avoid performing the "pin scan" operation +during replay of VACUUM WAL records. + +XXX FIXME: Toast re-checks are not yet added, so we still perform the +pin scan when replaying vacuum records of toast indexes. 
+ + Other Things That Are Handy to Know ----------------------------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 712385b3bfc..752e3b5dd12 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -22,6 +22,7 @@ #include "access/relscan.h" #include "access/xlog.h" #include "catalog/index.h" +#include "catalog/pg_namespace.h" #include "commands/vacuum.h" #include "storage/indexfsm.h" #include "storage/ipc.h" @@ -823,6 +824,11 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } /* + * Check to see if we need to issue one final WAL record for this index, + * which may be needed for correctness on a hot standby node when + * non-MVCC index scans could take place. This now only occurs when we + * perform a TOAST scan, so only occurs for TOAST indexes. + * * If the WAL is replayed in hot standby, the replay process needs to get * cleanup locks on all index leaf pages, just as we've been doing here. * However, we won't issue any WAL records about pages that have no items @@ -833,6 +839,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * against the last leaf page in the index, if that one wasn't vacuumed. */ if (XLogStandbyInfoActive() && + rel->rd_rel->relnamespace == PG_TOAST_NAMESPACE && vstate.lastBlockVacuumed < vstate.lastBlockLocked) { Buffer buf; @@ -1031,6 +1038,20 @@ restart: */ if (ndeletable > 0) { + BlockNumber lastBlockVacuumed = InvalidBlockNumber; + + /* + * We may need to record the lastBlockVacuumed for use when + * non-MVCC scans might be performed on the index on a + * hot standby. See explanation in btree_xlog_vacuum(). + * + * On a hot standby, a non-MVCC scan can only take place + * when we access a Toast Index, so we need only record + * the lastBlockVacuumed if we are vacuuming a Toast Index. 
+ */ + if (rel->rd_rel->relnamespace == PG_TOAST_NAMESPACE) + lastBlockVacuumed = vstate->lastBlockVacuumed; + /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an * instruction to the replay code to get cleanup lock on all pages @@ -1043,7 +1064,7 @@ restart: * that. */ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, - vstate->lastBlockVacuumed); + lastBlockVacuumed); /* * Remember highest leaf page number we've issued a diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index bba4840da05..0d094ca7faa 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -391,6 +391,19 @@ btree_xlog_vacuum(XLogReaderState *record) BTPageOpaque opaque; /* + * If we are running non-MVCC scans using this index we need to do some + * additional work to ensure correctness, which is known as a "pin scan" + * described in more detail in next paragraphs. We used to do the extra + * work in all cases, whereas we now avoid that work except when the index + * is a toast index, since toast scans aren't fully MVCC compliant. + * If lastBlockVacuumed is set to InvalidBlockNumber then we skip the + * additional work required for the pin scan. + * + * Avoiding this extra work is important since it requires us to touch + * every page in the index, so is an O(N) operation. Worse, it is an + * operation performed in the foreground during redo, so it delays + * replication directly. + * * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of @@ -412,7 +425,7 @@ btree_xlog_vacuum(XLogReaderState *record) * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. 
*/ - if (HotStandbyActiveInReplay()) + if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) { RelFileNode thisrnode; BlockNumber thisblkno; @@ -433,7 +446,8 @@ btree_xlog_vacuum(XLogReaderState *record) * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. + * not in shared_buffers we confirm it as unpinned. Optimizing + * this is now moot, since in most cases we avoid the scan. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 7631cb5c73f..68afc2e09bd 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -48,7 +48,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", + appendStringInfo(buf, "lastBlockVacuumed %d", xlrec->lastBlockVacuumed); break; } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9ebf446693b..b76083323b7 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -331,8 +331,10 @@ typedef struct xl_btree_reuse_page * The WAL record can represent deletion of any number of index tuples on a * single index page when executed by VACUUM. * - * The correctness requirement for applying these changes during recovery is - * that we must do one of these two things for every block in the index: + * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. 
+ * For non-MVCC index scans there is an additional correctness requirement + * for applying these changes during recovery, which is that we must do one + * of these two things for every block in the index: * * lock the block for cleanup and apply any required changes * * EnsureBlockUnpinned() * The purpose of this is to ensure that no index scans started before we