diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 4888b0b36b..dda3d03b14 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -42,7 +42,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.30 2009/01/20 18:59:37 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.31 2009/06/26 20:29:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1543,7 +1543,7 @@ CheckPointMultiXact(void) * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. */ - if (!InRecovery) + if (!RecoveryInProgress()) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 401d805a8f..5990bae8b8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.344 2009/06/25 21:36:00 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.345 2009/06/26 20:29:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -124,24 +124,36 @@ TimeLineID ThisTimeLineID = 0; /* * Are we doing recovery from XLOG? * - * This is only ever true in the startup process, even if the system is still - * in recovery. Prior to 8.4, all activity during recovery were carried out - * by Startup process. 
This local variable continues to be used in functions - * that need to act differently when called from a redo function (e.g skip - * WAL logging). To check whether the system is in recovery regardless of what + * This is only ever true in the startup process; it should be read as meaning + * "this process is replaying WAL records", rather than "the system is in + * recovery mode". It should be examined primarily by functions that need + * to act differently when called from a WAL redo function (e.g., to skip WAL + * logging). To check whether the system is in recovery regardless of which * process you're running in, use RecoveryInProgress(). */ bool InRecovery = false; -/* Are we recovering using offline XLOG archives? */ -static bool InArchiveRecovery = false; - /* * Local copy of SharedRecoveryInProgress variable. True actually means "not - * known, need to check the shared state" + * known, need to check the shared state". */ static bool LocalRecoveryInProgress = true; +/* + * Local state for XLogInsertAllowed(): + * 1: unconditionally allowed to insert XLOG + * 0: unconditionally not allowed to insert XLOG + * -1: must check RecoveryInProgress(); disallow until it is false + * Most processes start with -1 and transition to 1 after seeing that recovery + * is not in progress. But we can also force the value for special cases. + * The coding in XLogInsertAllowed() depends on the first two of these states + * being numerically the same as bool true and false. + */ +static int LocalXLogInsertAllowed = -1; + +/* Are we recovering using offline XLOG archives? */ +static bool InArchiveRecovery = false; + /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; @@ -260,7 +272,8 @@ static XLogRecPtr RedoRecPtr; * new log file. 
* * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures - * only one checkpointer at a time) + * only one checkpointer at a time; currently, with all checkpoints done by + * the bgwriter, this is just pro forma). * *---------- */ @@ -331,7 +344,7 @@ typedef struct XLogCtlData /* * SharedRecoveryInProgress indicates if we're still in crash or archive - * recovery. It's checked by RecoveryInProgress(). + * recovery. Protected by info_lck. */ bool SharedRecoveryInProgress; @@ -421,6 +434,7 @@ static XLogRecPtr ReadRecPtr; /* start of last record read */ static XLogRecPtr EndRecPtr; /* end+1 of last record read */ static XLogRecord *nextRecord = NULL; static TimeLineID lastPageTLI = 0; + static XLogRecPtr minRecoveryPoint; /* local copy of * ControlFile->minRecoveryPoint */ static bool updateMinRecoveryPoint = true; @@ -428,7 +442,7 @@ static bool updateMinRecoveryPoint = true; static bool InRedo = false; /* - * Flag set by interrupt handlers for later service in the redo loop. + * Flags set by interrupt handlers for later service in the redo loop. */ static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t shutdown_requested = false; @@ -537,8 +551,8 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); /* cross-check on whether we should be here or not */ - if (RecoveryInProgress()) - elog(FATAL, "cannot make new WAL entries during recovery"); + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) @@ -1780,7 +1794,7 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN) * database is consistent. * * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint - * is is only updated if it's not already greater than or equal to 'lsn'. + * is only updated if it's not already greater than or equal to 'lsn'. 
*/ static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) @@ -1796,7 +1810,8 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) /* * An invalid minRecoveryPoint means that we need to recover all the WAL, - * ie. crash recovery. Don't update the control file in that case. + * i.e., we're doing crash recovery. We never modify the control file's + * value in that case, so we can short-circuit future checks here too. */ if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) updateMinRecoveryPoint = false; @@ -1809,12 +1824,26 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) /* * To avoid having to update the control file too often, we update it * all the way to the last record being replayed, even though 'lsn' - * would suffice for correctness. + * would suffice for correctness. This also allows the 'force' case + * to not need a valid 'lsn' value. + * + * Another important reason for doing it this way is that the passed + * 'lsn' value could be bogus, i.e., past the end of available WAL, + * if the caller got it from a corrupted heap page. Accepting such + * a value as the min recovery point would prevent us from coming up + * at all. Instead, we just log a warning and continue with recovery. + * (See also the comments about corrupt LSNs in XLogFlush.) */ SpinLockAcquire(&xlogctl->info_lck); newMinRecoveryPoint = xlogctl->replayEndRecPtr; SpinLockRelease(&xlogctl->info_lck); + if (!force && XLByteLT(newMinRecoveryPoint, lsn)) + elog(WARNING, + "xlog min recovery request %X/%X is past current point %X/%X", + lsn.xlogid, lsn.xrecoff, + newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff); + /* update control file */ if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint)) { @@ -1843,10 +1872,13 @@ XLogFlush(XLogRecPtr record) XLogwrtRqst WriteRqst; /* - * During REDO, we don't try to flush the WAL, but update minRecoveryPoint - * instead. + * During REDO, we are reading not writing WAL. 
Therefore, instead of + * trying to flush the WAL, we should update minRecoveryPoint. + * We test XLogInsertAllowed(), not InRecovery, because we need the + * bgwriter to act this way too, and because when the bgwriter tries + * to write the end-of-recovery checkpoint, it should indeed flush. */ - if (RecoveryInProgress()) + if (!XLogInsertAllowed()) { UpdateMinRecoveryPoint(record, false); return; @@ -1935,21 +1967,20 @@ XLogFlush(XLogRecPtr record) * system's robustness rather than helping it: we do not want to take down * the whole system due to corruption on one data page. In particular, if * the bad page is encountered again during recovery then we would be - * unable to restart the database at all! (This scenario has actually - * happened in the field several times with 7.1 releases. Note that we - * cannot get here while RecoveryInProgress(), but if the bad page is - * brought in and marked dirty during recovery then if a checkpoint were - * performed at the end of recovery it will try to flush it. + * unable to restart the database at all! (This scenario actually + * happened in the field several times with 7.1 releases.) As of 8.4, + * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's + * problem; the only time we can reach here during recovery is while + * flushing the end-of-recovery checkpoint record, and we don't expect + * that to have a bad LSN. * - * The current approach is to ERROR under normal conditions, but only - * WARNING during recovery, so that the system can be brought up even if - * there's a corrupt LSN. Note that for calls from xact.c, the ERROR will + * Note that for calls from xact.c, the ERROR will * be promoted to PANIC since xact.c calls this routine inside a critical * section. However, calls from bufmgr.c are not within critical sections * and so we will not force a restart for a bad LSN on a data page. */ if (XLByteLT(LogwrtResult.Flush, record)) - elog(InRecovery ? 
WARNING : ERROR, + elog(ERROR, "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", record.xlogid, record.xrecoff, LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); @@ -2751,7 +2782,7 @@ RestoreArchivedFile(char *path, const char *xlogfname, /* * Set in_restore_command to tell the signal handler that we should exit - * right away on SIGTERM. We know that we're in a safe point to do that. + * right away on SIGTERM. We know that we're at a safe point to do that. * Check if we had already received the signal, so that we don't miss a * shutdown request received just before this. */ @@ -2833,7 +2864,7 @@ RestoreArchivedFile(char *path, const char *xlogfname, * problems such as an unfindable command; treat those as fatal errors * too. */ - if (WTERMSIG(rc) == SIGTERM) + if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) proc_exit(1); signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125; @@ -4543,6 +4574,7 @@ XLOGShmemInit(void) * in additional info.) */ XLogCtl->XLogCacheBlck = XLOGbuffers - 1; + XLogCtl->SharedRecoveryInProgress = true; XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages); SpinLockInit(&XLogCtl->info_lck); @@ -5164,8 +5196,6 @@ StartupXLOG(void) TransactionId oldestActiveXID; bool bgwriterLaunched = false; - XLogCtl->SharedRecoveryInProgress = true; - /* * Read control file and check XLOG status looks valid. 
* @@ -5392,7 +5422,7 @@ StartupXLOG(void) /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); - /* update our local copy of minRecoveryPoint */ + /* initialize our local copy of minRecoveryPoint */ minRecoveryPoint = ControlFile->minRecoveryPoint; /* @@ -5450,7 +5480,7 @@ StartupXLOG(void) /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; - /* Update shared replayEndRecPtr */ + /* initialize shared replayEndRecPtr */ SpinLockAcquire(&xlogctl->info_lck); xlogctl->replayEndRecPtr = ReadRecPtr; SpinLockRelease(&xlogctl->info_lck); @@ -5476,7 +5506,8 @@ StartupXLOG(void) * recovering after crash. * * After this point, we can no longer assume that we're the only - * process in addition to postmaster! + * process in addition to postmaster! Also, fsync requests are + * subsequently to be handled by the bgwriter, not locally. */ if (InArchiveRecovery && IsUnderPostmaster) { @@ -5526,11 +5557,11 @@ StartupXLOG(void) proc_exit(1); /* - * Have we reached our safe starting point? If so, we can tell + * Have we passed our safe starting point? If so, we can tell * postmaster that the database is consistent now. */ if (!reachedMinRecoveryPoint && - XLByteLE(minRecoveryPoint, EndRecPtr)) + XLByteLT(minRecoveryPoint, EndRecPtr)) { reachedMinRecoveryPoint = true; if (InArchiveRecovery) @@ -5616,7 +5647,10 @@ StartupXLOG(void) /* * Complain if we did not roll forward far enough to render the backup - * dump consistent. + * dump consistent. Note: it is indeed okay to look at the local variable + * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might + * be further ahead --- ControlFile->minRecoveryPoint cannot have been + * advanced beyond the WAL we processed. */ if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint)) { @@ -5816,14 +5850,27 @@ StartupXLOG(void) } /* - * All done. Allow backends to write WAL. + * All done. Allow backends to write WAL. 
(Although the bool flag is + * probably atomic in itself, we use the info_lck here to ensure that + * there are no race conditions concerning visibility of other recent + * updates to shared memory.) */ - XLogCtl->SharedRecoveryInProgress = false; + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->SharedRecoveryInProgress = false; + SpinLockRelease(&xlogctl->info_lck); + } } /* * Is the system still in recovery? * + * Unlike testing InRecovery, this works in any process that's connected to + * shared memory. + * * As a side-effect, we initialize the local TimeLineID and RedoRecPtr * variables the first time we see that recovery is finished. */ @@ -5831,9 +5878,9 @@ bool RecoveryInProgress(void) { /* - * We check shared state each time only until we leave recovery mode. We - * can't re-enter recovery, so we rely on the local state variable after - * that. + * We check shared state each time only until we leave recovery mode. + * We can't re-enter recovery, so there's no need to keep checking after + * the shared variable has once been seen false. */ if (!LocalRecoveryInProgress) return false; @@ -5842,11 +5889,15 @@ RecoveryInProgress(void) /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; + /* spinlock is essential on machines with weak memory ordering! */ + SpinLockAcquire(&xlogctl->info_lck); LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress; + SpinLockRelease(&xlogctl->info_lck); /* - * Initialize TimeLineID and RedoRecPtr the first time we see that - * recovery is finished. + * Initialize TimeLineID and RedoRecPtr when we discover that recovery + * is finished. (If you change this, see also + * LocalSetXLogInsertAllowed.) */ if (!LocalRecoveryInProgress) InitXLOGAccess(); @@ -5855,6 +5906,51 @@ RecoveryInProgress(void) } } +/* + * Is this process allowed to insert new WAL records? 
+ * + * Ordinarily this is essentially equivalent to !RecoveryInProgress(). + * But we also have provisions for forcing the result "true" or "false" + * within specific processes regardless of the global state. + */ +bool +XLogInsertAllowed(void) +{ + /* + * If value is "unconditionally true" or "unconditionally false", + * just return it. This provides the normal fast path once recovery + * is known done. + */ + if (LocalXLogInsertAllowed >= 0) + return (bool) LocalXLogInsertAllowed; + + /* + * Else, must check to see if we're still in recovery. + */ + if (RecoveryInProgress()) + return false; + + /* + * On exit from recovery, reset to "unconditionally true", since there + * is no need to keep checking. + */ + LocalXLogInsertAllowed = 1; + return true; +} + +/* + * Make XLogInsertAllowed() return true in the current process only. + */ +static void +LocalSetXLogInsertAllowed(void) +{ + Assert(LocalXLogInsertAllowed == -1); + LocalXLogInsertAllowed = 1; + + /* Initialize as RecoveryInProgress() would do when switching state */ + InitXLOGAccess(); +} + /* * Subroutine to try to fetch and validate a prior checkpoint record. * @@ -6126,7 +6222,7 @@ ShutdownXLOG(int code, Datum arg) static void LogCheckpointStart(int flags, bool restartpoint) { - char *msg; + const char *msg; /* * XXX: This is hopelessly untranslatable. We could call gettext_noop for @@ -6205,7 +6301,7 @@ LogCheckpointEnd(bool restartpoint) * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured - * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). * * Note: flags contains other bits, of interest here only for logging purposes. 
@@ -6225,44 +6321,19 @@ CreateCheckPoint(int flags) uint32 _logSeg; TransactionId *inCommitXids; int nInCommit; - bool OldInRecovery = InRecovery; /* * An end-of-recovery checkpoint is really a shutdown checkpoint, just * issued at a different time. */ - if (flags & ((CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY) != 0)) + if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) shutdown = true; else shutdown = false; - /* - * A startup checkpoint is created before anyone else is allowed to - * write WAL. To allow us to write the checkpoint record, set - * LocalRecoveryInProgress to false. This lets us write WAL, but others - * are still not allowed to do so. - */ - if (flags & CHECKPOINT_END_OF_RECOVERY) - { - Assert(RecoveryInProgress()); - LocalRecoveryInProgress = false; - InitXLOGAccess(); - - /* - * Before 8.4, end-of-recovery checkpoints were always performed by - * the startup process, and InRecovery was set true. InRecovery is not - * normally set in bgwriter, but we set it here temporarily to avoid - * confusing old code in the end-of-recovery checkpoint code path that - * rely on it. - */ - InRecovery = true; - } - else - { - /* shouldn't happen */ - if (RecoveryInProgress()) - elog(ERROR, "can't create a checkpoint during recovery"); - } + /* sanity check */ + if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0) + elog(ERROR, "can't create a checkpoint during recovery"); /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. @@ -6305,7 +6376,6 @@ CreateCheckPoint(int flags) /* Begin filling in the checkpoint WAL record */ MemSet(&checkPoint, 0, sizeof(checkPoint)); - checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.time = (pg_time_t) time(NULL); /* @@ -6472,6 +6542,20 @@ CreateCheckPoint(int flags) START_CRIT_SECTION(); + /* + * An end-of-recovery checkpoint is created before anyone is allowed to + * write WAL. 
To allow us to write the checkpoint record, temporarily + * enable XLogInsertAllowed. + */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalSetXLogInsertAllowed(); + + /* + * This needs to be done after LocalSetXLogInsertAllowed(), else + * ThisTimeLineID might still be uninitialized. + */ + checkPoint.ThisTimeLineID = ThisTimeLineID; + /* * Now insert the checkpoint record into XLOG. */ @@ -6487,6 +6571,21 @@ CreateCheckPoint(int flags) XLogFlush(recptr); + /* + * We mustn't write any new WAL after a shutdown checkpoint, or it will + * be overwritten at next startup. No-one should even try; this just + * allows sanity-checking. In the case of an end-of-recovery checkpoint, + * we want to just temporarily disable writing until the system has exited + * recovery. + */ + if (shutdown) + { + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalXLogInsertAllowed = -1; /* return to "check" state */ + else + LocalXLogInsertAllowed = 0; /* never again write WAL */ + } + /* * We now have ProcLastRecPtr = start of actual checkpoint record, recptr * = end of actual checkpoint record. @@ -6560,7 +6659,7 @@ CreateCheckPoint(int flags) * in subtrans.c). During recovery, though, we mustn't do this because * StartupSUBTRANS hasn't been called yet. */ - if (!InRecovery) + if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestXmin(true, false)); /* All real work is done, but log before releasing lock. */ @@ -6574,9 +6673,6 @@ CreateCheckPoint(int flags) CheckpointStats.ckpt_segs_recycled); LWLockRelease(CheckpointLock); - - /* Restore old value */ - InRecovery = OldInRecovery; } /* @@ -6597,10 +6693,14 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) } /* - * This is used during WAL recovery to establish a point from which recovery - * can roll forward without replaying the entire recovery log. This function - * is called each time a checkpoint record is read from XLOG. It is stored - * in shared memory, so that it can be used as a restartpoint later on. 
+ * Save a checkpoint for recovery restart if appropriate + * + * This function is called each time a checkpoint record is read from XLOG. + * It must determine whether the checkpoint represents a safe restartpoint or + * not. If so, the checkpoint record is stashed in shared memory so that + * CreateRestartPoint can consult it. (Note that the latter function is + * executed by the bgwriter, while this one will be executed by the startup + * process.) */ static void RecoveryRestartPoint(const CheckPoint *checkPoint) @@ -6640,12 +6740,14 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) } /* + * Establish a restartpoint if possible. + * * This is similar to CreateCheckPoint, but is used during WAL recovery * to establish a point from which recovery can roll forward without * replaying the entire recovery log. * * Returns true if a new restartpoint was established. We can only establish - * a restartpoint if we have replayed a checkpoint record since last + * a restartpoint if we have replayed a safe checkpoint record since last * restartpoint. */ bool @@ -6663,7 +6765,7 @@ CreateRestartPoint(int flags) */ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); - /* Get the a local copy of the last checkpoint record. */ + /* Get a local copy of the last safe checkpoint record. */ SpinLockAcquire(&xlogctl->info_lck); lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr; memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint)); @@ -6723,14 +6825,21 @@ CreateRestartPoint(int flags) CheckPointGuts(lastCheckPoint.redo, flags); /* - * Update pg_control, using current time + * Update pg_control, using current time. Check that it still shows + * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing; + * this is a quick hack to make sure nothing really bad happens if + * somehow we get here after the end-of-recovery checkpoint. 
*/ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->prevCheckPoint = ControlFile->checkPoint; - ControlFile->checkPoint = lastCheckPointRecPtr; - ControlFile->checkPointCopy = lastCheckPoint; - ControlFile->time = (pg_time_t) time(NULL); - UpdateControlFile(); + if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY && + XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo)) + { + ControlFile->prevCheckPoint = ControlFile->checkPoint; + ControlFile->checkPoint = lastCheckPointRecPtr; + ControlFile->checkPointCopy = lastCheckPoint; + ControlFile->time = (pg_time_t) time(NULL); + UpdateControlFile(); + } LWLockRelease(ControlFileLock); /* @@ -6747,6 +6856,7 @@ CreateRestartPoint(int flags) (errmsg("recovery restart point at %X/%X", lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); + /* XXX this is currently BROKEN because we are in the wrong process */ if (recoveryLastXTime) ereport((log_checkpoints ? LOG : DEBUG2), (errmsg("last completed transaction was at log time %s", @@ -6821,7 +6931,7 @@ RequestXLogSwitch(void) * XLOG resource manager's routines * * Definitions of info values are in include/catalog/pg_control.h, though - * not all records types are related to control file processing. + * not all record types are related to control file updates. */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index b5fd31532e..831ea9478a 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -19,7 +19,8 @@ * condition.) * * The bgwriter is started by the postmaster as soon as the startup subprocess - * finishes. It remains alive until the postmaster commands it to terminate. + * finishes, or as soon as recovery begins if we are doing archive recovery. + * It remains alive until the postmaster commands it to terminate. 
* Normal termination is by SIGUSR2, which instructs the bgwriter to execute * a shutdown checkpoint and then exit(0). (All backends must be stopped * before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; like any @@ -37,7 +38,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.61 2009/06/25 21:36:00 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.62 2009/06/26 20:29:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -902,11 +903,11 @@ BgWriterShmemInit(void) * * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. - * CHECKPOINT_END_OF_RECOVERY: checkpoint is to finish WAL recovery. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured - * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal bgwriter to do it, and return). diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index c9b0e0ab2e..3dbf36a6cf 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.582 2009/06/11 14:49:01 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.583 2009/06/26 20:29:04 tgl Exp $ * * NOTES * @@ -227,21 +227,22 @@ static bool RecoveryError = false; /* T if WAL recovery failed */ * * After doing all the postmaster initialization work, we enter PM_STARTUP * state and the startup process is launched. 
The startup process begins by - * reading the control file and other preliminary initialization steps. When - * it's ready to start WAL redo, it signals postmaster, and we switch to - * PM_RECOVERY phase. The background writer is launched, while the startup - * process continues applying WAL. + * reading the control file and other preliminary initialization steps. + * In a normal startup, or after crash recovery, the startup process exits + * with exit code 0 and we switch to PM_RUN state. However, archive recovery + * is handled specially since it takes much longer and we would like to support + * hot standby during archive recovery. * + * When the startup process is ready to start archive recovery, it signals the + * postmaster, and we switch to PM_RECOVERY state. The background writer is + * launched, while the startup process continues applying WAL. * After reaching a consistent point in WAL redo, startup process signals - * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently + * us again, and we switch to PM_RECOVERY_CONSISTENT state. There's currently * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we * could start accepting connections to perform read-only queries at this * point, if we had the infrastructure to do that. - * - * When WAL redo is finished, the startup process exits with exit code 0 - * and we switch to PM_RUN state. Startup process can also skip the - * recovery and consistent recovery phases altogether, as it will during - * normal startup when there's no recovery to be done, for example. + * When archive recovery is finished, the startup process exits with exit + * code 0 and we switch to PM_RUN state. * * Normal child backends can only be launched when we are in PM_RUN state. * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.) 
@@ -269,7 +270,7 @@ typedef enum { PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ - PM_RECOVERY, /* in recovery mode */ + PM_RECOVERY, /* in archive recovery mode */ PM_RECOVERY_CONSISTENT, /* consistent recovery mode */ PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKUP, /* waiting for online backup mode to end */ @@ -2195,8 +2196,8 @@ reaper(SIGNAL_ARGS) /* * Unexpected exit of startup process (including FATAL exit) - * during PM_STARTUP is treated as catastrophic. There is no other - * processes running yet, so we can just exit. + * during PM_STARTUP is treated as catastrophic. There are no + * other processes running yet, so we can just exit. */ if (pmState == PM_STARTUP && !EXIT_STATUS_0(exitstatus)) { @@ -2247,7 +2248,7 @@ reaper(SIGNAL_ARGS) /* * Crank up the background writer, if we didn't do that already - * when we entered consistent recovery phase. It doesn't matter + * when we entered consistent recovery state. It doesn't matter * if this fails, we'll just try again later. */ if (BgWriterPID == 0) @@ -4008,7 +4009,7 @@ sigusr1_handler(SIGNAL_ARGS) /* * Load the flat authorization file into postmaster's cache. The * startup process won't have recomputed this from the database yet, - * so we it may change following recovery. + * so it may change following recovery. 
*/ load_role(); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 18402a6ad6..0c4861d6db 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.147 2009/06/25 21:36:00 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.148 2009/06/26 20:29:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -204,10 +204,10 @@ mdinit(void) } /* - * In archive recovery, we rely on bgwriter to do fsyncs(), but we don't - * know that we do archive recovery at process startup when pendingOpsTable - * has already been created. Calling this function drops pendingOpsTable - * and causes any subsequent requests to be forwarded to bgwriter. + * In archive recovery, we rely on bgwriter to do fsyncs, but we will have + * already created the pendingOpsTable during initialization of the startup + * process. Calling this function drops the local pendingOpsTable so that + * subsequent requests will be forwarded to bgwriter. 
*/ void SetForwardFsyncRequests(void) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index ea9e232a08..052a314d74 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.92 2009/06/25 21:36:00 heikki Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.93 2009/06/26 20:29:04 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -159,15 +159,15 @@ extern bool XLOG_DEBUG; /* These directly affect the behavior of CreateCheckPoint and subsidiaries */ #define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */ -#define CHECKPOINT_IMMEDIATE 0x0002 /* Do it without delays */ -#define CHECKPOINT_FORCE 0x0004 /* Force even if no activity */ -/* These are important to RequestCheckpoint */ -#define CHECKPOINT_WAIT 0x0008 /* Wait for completion */ -/* These indicate the cause of a checkpoint request */ -#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */ -#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */ -#define CHECKPOINT_END_OF_RECOVERY 0x0040 /* Like shutdown checkpoint, but +#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but * issued at end of WAL recovery */ +#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */ +#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */ +/* These are important to RequestCheckpoint */ +#define CHECKPOINT_WAIT 0x0010 /* Wait for completion */ +/* These indicate the cause of a checkpoint request */ +#define CHECKPOINT_CAUSE_XLOG 0x0020 /* XLOG consumption */ +#define CHECKPOINT_CAUSE_TIME 0x0040 /* Elapsed time */ /* Checkpoint statistics */ typedef struct CheckpointStatsData @@ -202,6 +202,7 @@ extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); extern bool 
RecoveryInProgress(void); +extern bool XLogInsertAllowed(void); extern void UpdateControlFile(void); extern Size XLOGShmemSize(void);