From 361bd1662eb1edbc24a9e26dd413e5f38c68fcdf Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 13 Apr 2010 14:17:46 +0000 Subject: [PATCH] Allow Hot Standby to begin from a shutdown checkpoint. Patch by Simon Riggs & me --- src/backend/access/transam/twophase.c | 85 ++++++++++- src/backend/access/transam/xlog.c | 198 ++++++++++++++++++-------- src/include/access/twophase.h | 3 +- 3 files changed, 224 insertions(+), 62 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b1bf2c4f26..faafc7e5c1 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $ * * NOTES * Each global transaction is associated with a global transaction @@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) return result; } +/* + * StandbyRecoverPreparedTransactions + * + * Scan the pg_twophase directory and setup all the required information to + * allow standby queries to treat prepared transactions as still active. + * This is never called at the end of recovery - we use + * RecoverPreparedTransactions() at that point. + * + * Currently we simply call SubTransSetParent() for any subxids of prepared + * transactions. If overwriteOK is true, it's OK if some XIDs have already + * been marked in pg_subtrans. 
+ */ +void +StandbyRecoverPreparedTransactions(bool overwriteOK) +{ + DIR *cldir; + struct dirent *clde; + + cldir = AllocateDir(TWOPHASE_DIR); + while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + { + if (strlen(clde->d_name) == 8 && + strspn(clde->d_name, "0123456789ABCDEF") == 8) + { + TransactionId xid; + char *buf; + TwoPhaseFileHeader *hdr; + TransactionId *subxids; + int i; + + xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + + /* Already processed? */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + { + ereport(WARNING, + (errmsg("removing stale two-phase state file \"%s\"", + clde->d_name))); + RemoveTwoPhaseFile(xid, true); + continue; + } + + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, true); + if (buf == NULL) + { + ereport(WARNING, + (errmsg("removing corrupt two-phase state file \"%s\"", + clde->d_name))); + RemoveTwoPhaseFile(xid, true); + continue; + } + + /* Deconstruct header */ + hdr = (TwoPhaseFileHeader *) buf; + if (!TransactionIdEquals(hdr->xid, xid)) + { + ereport(WARNING, + (errmsg("removing corrupt two-phase state file \"%s\"", + clde->d_name))); + RemoveTwoPhaseFile(xid, true); + pfree(buf); + continue; + } + + /* + * Examine subtransaction XIDs ... they should all follow main + * XID. 
+ */ + subxids = (TransactionId *) + (buf + MAXALIGN(sizeof(TwoPhaseFileHeader))); + for (i = 0; i < hdr->nsubxacts; i++) + { + TransactionId subxid = subxids[i]; + + Assert(TransactionIdFollows(subxid, xid)); + SubTransSetParent(xid, subxid, overwriteOK); + } + } + } + FreeDir(cldir); +} + /* * RecoverPreparedTransactions * diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 379c6f1175..5fd4b870be 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0; static XLogRecPtr minRecoveryPoint; /* local copy of * ControlFile->minRecoveryPoint */ static bool updateMinRecoveryPoint = true; +static bool reachedMinRecoveryPoint = false; static bool InRedo = false; @@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void); static void CleanupBackupHistory(void); static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt); +static void CheckRecoveryConsistency(void); static bool ValidXLOGHeader(XLogPageHeader hdr, int emode); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); static List *readTimeLineHistory(TimeLineID targetTLI); @@ -5591,7 +5593,6 @@ StartupXLOG(void) uint32 freespace; TransactionId oldestActiveXID; bool bgwriterLaunched = false; - bool backendsAllowed = false; /* * Read control file and check XLOG status looks valid. 
@@ -5838,6 +5839,8 @@ StartupXLOG(void) if (InRecovery) { int rmid; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; /* * Update pg_control to show that we are recovering and to show the @@ -5930,6 +5933,33 @@ StartupXLOG(void) StartupMultiXact(); ProcArrayInitRecoveryInfo(oldestActiveXID); + + /* + * If we're beginning at a shutdown checkpoint, we know that + * nothing was running on the master at this point. So fake-up + * an empty running-xacts record and use that here and now. + * Recover additional standby state for prepared transactions. + */ + if (wasShutdown) + { + RunningTransactionsData running; + + /* + * Construct a RunningTransactions snapshot representing a shut + * down server, with only prepared transactions still alive. + * We're never overflowed at this point because all subxids + * are listed with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxid_overflow = false; + running.nextXid = checkPoint.nextXid; + running.oldestRunningXid = oldestActiveXID; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(false); + } } /* Initialize resource managers */ @@ -5939,6 +5969,46 @@ StartupXLOG(void) RmgrTable[rmid].rm_startup(); } + /* + * Initialize shared replayEndRecPtr and recoveryLastRecPtr. + * + * This is slightly confusing if we're starting from an online + * checkpoint; we've just read and replayed the checkpoint record, + * but we're going to start replay from its redo pointer, which + * precedes the location of the checkpoint record itself. So even + * though the last record we've replayed is indeed ReadRecPtr, we + * haven't replayed all the preceding records yet. That's OK for + * the current use of these variables. 
+ */ + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->replayEndRecPtr = ReadRecPtr; + xlogctl->recoveryLastRecPtr = ReadRecPtr; + SpinLockRelease(&xlogctl->info_lck); + + /* + * Let postmaster know we've started redo now, so that it can + * launch bgwriter to perform restartpoints. We don't bother + * during crash recovery as restartpoints can only be performed + * during archive recovery. And we'd like to keep crash recovery + * simple, to avoid introducing bugs that could prevent you from + * recovering after crash. + * + * After this point, we can no longer assume that we're the only + * process in addition to postmaster! Also, fsync requests are + * subsequently to be handled by the bgwriter, not locally. + */ + if (InArchiveRecovery && IsUnderPostmaster) + { + SetForwardFsyncRequests(); + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + bgwriterLaunched = true; + } + + /* + * Allow read-only connections immediately if we're consistent already. + */ + CheckRecoveryConsistency(); + /* * Find the first record that logically follows the checkpoint --- it * might physically precede it, though. @@ -5958,43 +6028,14 @@ StartupXLOG(void) { bool recoveryContinue = true; bool recoveryApply = true; - bool reachedMinRecoveryPoint = false; ErrorContextCallback errcontext; - /* use volatile pointer to prevent code rearrangement */ - volatile XLogCtlData *xlogctl = XLogCtl; - - /* initialize shared replayEndRecPtr and recoveryLastRecPtr */ - SpinLockAcquire(&xlogctl->info_lck); - xlogctl->replayEndRecPtr = ReadRecPtr; - xlogctl->recoveryLastRecPtr = ReadRecPtr; - SpinLockRelease(&xlogctl->info_lck); - InRedo = true; ereport(LOG, (errmsg("redo starts at %X/%X", ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); - /* - * Let postmaster know we've started redo now, so that it can - * launch bgwriter to perform restartpoints. We don't bother - * during crash recovery as restartpoints can only be performed - * during archive recovery. 
And we'd like to keep crash recovery - * simple, to avoid introducing bugs that could you from - * recovering after crash. - * - * After this point, we can no longer assume that we're the only - * process in addition to postmaster! Also, fsync requests are - * subsequently to be handled by the bgwriter, not locally. - */ - if (InArchiveRecovery && IsUnderPostmaster) - { - SetForwardFsyncRequests(); - SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); - bgwriterLaunched = true; - } - /* * main redo apply loop */ @@ -6024,32 +6065,8 @@ StartupXLOG(void) /* Handle interrupt signals of startup process */ HandleStartupProcInterrupts(); - /* - * Have we passed our safe starting point? - */ - if (!reachedMinRecoveryPoint && - XLByteLE(minRecoveryPoint, EndRecPtr) && - XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) - { - reachedMinRecoveryPoint = true; - ereport(LOG, - (errmsg("consistent recovery state reached at %X/%X", - EndRecPtr.xlogid, EndRecPtr.xrecoff))); - } - - /* - * Have we got a valid starting snapshot that will allow - * queries to be run? If so, we can tell postmaster that the - * database is consistent now, enabling connections. - */ - if (standbyState == STANDBY_SNAPSHOT_READY && - !backendsAllowed && - reachedMinRecoveryPoint && - IsUnderPostmaster) - { - backendsAllowed = true; - SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); - } + /* Allow read-only connections if we're consistent now */ + CheckRecoveryConsistency(); /* * Have we reached our recovery target? @@ -6398,6 +6415,44 @@ StartupXLOG(void) } } +/* + * Checks if recovery has reached a consistent state. When consistency is + * reached and we have a valid starting standby snapshot, tell postmaster + * that it can start accepting read-only connections. + */ +static void +CheckRecoveryConsistency(void) +{ + static bool backendsAllowed = false; + + /* + * Have we passed our safe starting point? 
+ */ + if (!reachedMinRecoveryPoint && + XLByteLE(minRecoveryPoint, EndRecPtr) && + XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + { + reachedMinRecoveryPoint = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + EndRecPtr.xlogid, EndRecPtr.xrecoff))); + } + + /* + * Have we got a valid starting snapshot that will allow + * queries to be run? If so, we can tell postmaster that the + * database is consistent now, enabling connections. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + !backendsAllowed && + reachedMinRecoveryPoint && + IsUnderPostmaster) + { + backendsAllowed = true; + SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); + } +} + /* * Is the system still in recovery? * @@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) if (standbyState != STANDBY_DISABLED) CheckRequiredParameterValues(checkPoint); + /* + * If we see a shutdown checkpoint, we know that nothing was + * running on the master at this point. So fake-up an empty + * running-xacts record and use that here and now. Recover + * additional standby state for prepared transactions. + */ if (standbyState >= STANDBY_INITIALIZED) { + TransactionId *xids; + int nxids; + TransactionId oldestActiveXID; + RunningTransactionsData running; + + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + /* - * Remove stale transactions, if any. + * Construct a RunningTransactions snapshot representing a shut + * down server, with only prepared transactions still alive. + * We're never overflowed at this point because all subxids + * are listed with their parent prepared transactions. 
*/ - ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid); - StandbyReleaseOldLocks(checkPoint.nextXid); + running.xcnt = nxids; + running.subxid_overflow = false; + running.nextXid = checkPoint.nextXid; + running.oldestRunningXid = oldestActiveXID; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(true); } /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 61b92244fb..ea3c9966c7 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid); extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p); +extern void StandbyRecoverPreparedTransactions(bool overwriteOK); extern void RecoverPreparedTransactions(void); extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);