mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-15 08:20:16 +08:00
Fast promote mode skips checkpoint at end of recovery.
pg_ctl promote -m fast skips the checkpoint at the end of recovery, so that we can achieve very fast failover when the apply delay is low. A new WAL record, XLOG_END_OF_RECOVERY, is written so that downstream log readers can switch timeline correctly. When the synchronous end-of-recovery checkpoint is skipped, we request a normal spread checkpoint instead, so that the window of re-recovery stays small. Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao. Review by Heikki Linnakangas.
This commit is contained in:
parent
ee22c55f5a
commit
fd4ced5230
@ -18,6 +18,7 @@
|
||||
#include "access/xlog_internal.h"
|
||||
#include "catalog/pg_control.h"
|
||||
#include "utils/guc.h"
|
||||
#include "utils/timestamp.h"
|
||||
|
||||
/*
|
||||
* GUC support
|
||||
@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
memcpy(&fpw, rec, sizeof(bool));
|
||||
appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
|
||||
}
|
||||
else if (info == XLOG_END_OF_RECOVERY)
|
||||
{
|
||||
xl_end_of_recovery xlrec;
|
||||
|
||||
memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
|
||||
appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
|
||||
xlrec.ThisTimeLineID,
|
||||
timestamptz_to_str(xlrec.end_time));
|
||||
}
|
||||
else
|
||||
appendStringInfo(buf, "UNKNOWN");
|
||||
}
|
||||
|
@ -66,6 +66,7 @@
|
||||
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
||||
#define RECOVERY_COMMAND_DONE "recovery.done"
|
||||
#define PROMOTE_SIGNAL_FILE "promote"
|
||||
#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
|
||||
|
||||
|
||||
/* User-settable parameters */
|
||||
@ -210,6 +211,9 @@ bool StandbyMode = false;
|
||||
static char *PrimaryConnInfo = NULL;
|
||||
static char *TriggerFile = NULL;
|
||||
|
||||
/* whether request for fast promotion has been made yet */
|
||||
static bool fast_promote = false;
|
||||
|
||||
/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
|
||||
static TransactionId recoveryStopXid;
|
||||
static TimestampTz recoveryStopTime;
|
||||
@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
|
||||
static void XLogReportParameters(void);
|
||||
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
|
||||
static void LocalSetXLogInsertAllowed(void);
|
||||
static void CreateEndOfRecoveryRecord(void);
|
||||
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
|
||||
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
|
||||
|
||||
@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
|
||||
int emode, bool fetching_ckpt);
|
||||
static void CheckRecoveryConsistency(void);
|
||||
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
|
||||
XLogRecPtr RecPtr, int whichChkpt);
|
||||
XLogRecPtr RecPtr, int whichChkpti, bool report);
|
||||
static bool rescanLatestTimeLine(void);
|
||||
static void WriteControlFile(void);
|
||||
static void ReadControlFile(void);
|
||||
@ -4848,7 +4853,7 @@ StartupXLOG(void)
|
||||
* When a backup_label file is present, we want to roll forward from
|
||||
* the checkpoint it identifies, rather than using pg_control.
|
||||
*/
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
|
||||
if (record != NULL)
|
||||
{
|
||||
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
|
||||
@ -4890,7 +4895,7 @@ StartupXLOG(void)
|
||||
*/
|
||||
checkPointLoc = ControlFile->checkPoint;
|
||||
RedoStartLSN = ControlFile->checkPointCopy.redo;
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
|
||||
if (record != NULL)
|
||||
{
|
||||
ereport(DEBUG1,
|
||||
@ -4909,7 +4914,7 @@ StartupXLOG(void)
|
||||
else
|
||||
{
|
||||
checkPointLoc = ControlFile->prevCheckPoint;
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
|
||||
if (record != NULL)
|
||||
{
|
||||
ereport(LOG,
|
||||
@ -5393,22 +5398,33 @@ StartupXLOG(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Before replaying this record, check if it is a shutdown
|
||||
* checkpoint record that causes the current timeline to
|
||||
* change. The checkpoint record is already considered to be
|
||||
* part of the new timeline, so we update ThisTimeLineID
|
||||
* before replaying it. That's important so that replayEndTLI,
|
||||
* which is recorded as the minimum recovery point's TLI if
|
||||
* Before replaying this record, check if this record
|
||||
* causes the current timeline to change. The record is
|
||||
* already considered to be part of the new timeline,
|
||||
* so we update ThisTimeLineID before replaying it.
|
||||
* That's important so that replayEndTLI, which is
|
||||
* recorded as the minimum recovery point's TLI if
|
||||
* recovery stops after this record, is set correctly.
|
||||
*/
|
||||
if (record->xl_rmid == RM_XLOG_ID &&
|
||||
(record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
|
||||
if (record->xl_rmid == RM_XLOG_ID)
|
||||
{
|
||||
CheckPoint checkPoint;
|
||||
TimeLineID newTLI;
|
||||
TimeLineID newTLI = ThisTimeLineID;
|
||||
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
||||
|
||||
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
|
||||
newTLI = checkPoint.ThisTimeLineID;
|
||||
if (info == XLOG_CHECKPOINT_SHUTDOWN)
|
||||
{
|
||||
CheckPoint checkPoint;
|
||||
|
||||
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
|
||||
newTLI = checkPoint.ThisTimeLineID;
|
||||
}
|
||||
else if (info == XLOG_END_OF_RECOVERY)
|
||||
{
|
||||
xl_end_of_recovery xlrec;
|
||||
|
||||
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
|
||||
newTLI = xlrec.ThisTimeLineID;
|
||||
}
|
||||
|
||||
if (newTLI != ThisTimeLineID)
|
||||
{
|
||||
@ -5729,9 +5745,36 @@ StartupXLOG(void)
|
||||
* allows some extra error checking in xlog_redo.
|
||||
*/
|
||||
if (bgwriterLaunched)
|
||||
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
|
||||
CHECKPOINT_IMMEDIATE |
|
||||
CHECKPOINT_WAIT);
|
||||
{
|
||||
bool checkpoint_wait = true;
|
||||
|
||||
/*
|
||||
* If we've been explicitly promoted with fast option,
|
||||
* end of recovery without a checkpoint if possible.
|
||||
*/
|
||||
if (fast_promote)
|
||||
{
|
||||
checkPointLoc = ControlFile->prevCheckPoint;
|
||||
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
|
||||
if (record != NULL)
|
||||
{
|
||||
checkpoint_wait = false;
|
||||
CreateEndOfRecoveryRecord();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* In most cases we will wait for a full checkpoint to complete.
|
||||
*
|
||||
* If not, issue a normal, non-immediate checkpoint but don't wait.
|
||||
*/
|
||||
if (checkpoint_wait)
|
||||
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
|
||||
CHECKPOINT_IMMEDIATE |
|
||||
CHECKPOINT_WAIT);
|
||||
else
|
||||
RequestCheckpoint(0); /* No flags */
|
||||
}
|
||||
else
|
||||
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
|
||||
|
||||
@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
|
||||
*/
|
||||
static XLogRecord *
|
||||
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
|
||||
int whichChkpt)
|
||||
int whichChkpt, bool report)
|
||||
{
|
||||
XLogRecord *record;
|
||||
|
||||
if (!XRecOffIsValid(RecPtr))
|
||||
{
|
||||
if (!report)
|
||||
return NULL;
|
||||
|
||||
switch (whichChkpt)
|
||||
{
|
||||
case 1:
|
||||
@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
|
||||
|
||||
if (record == NULL)
|
||||
{
|
||||
if (!report)
|
||||
return NULL;
|
||||
|
||||
switch (whichChkpt)
|
||||
{
|
||||
case 1:
|
||||
@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
|
||||
LWLockRelease(CheckpointLock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the end of recovery in WAL though without running a full checkpoint.
|
||||
* We can expect that a restartpoint is likely to be in progress as we
|
||||
* do this, though we are unwilling to wait for it to complete. So be
|
||||
* careful to avoid taking the CheckpointLock anywhere here.
|
||||
*
|
||||
* CreateRestartPoint() allows for the case where recovery may end before
|
||||
* the restartpoint completes so there is no concern of concurrent behaviour.
|
||||
*/
|
||||
void
|
||||
CreateEndOfRecoveryRecord(void)
|
||||
{
|
||||
xl_end_of_recovery xlrec;
|
||||
XLogRecData rdata;
|
||||
|
||||
/* sanity check */
|
||||
if (!RecoveryInProgress())
|
||||
elog(ERROR, "can only be used to end recovery");
|
||||
|
||||
xlrec.end_time = time(NULL);
|
||||
xlrec.ThisTimeLineID = ThisTimeLineID;
|
||||
|
||||
LocalSetXLogInsertAllowed();
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
rdata.data = (char *) &xlrec;
|
||||
rdata.len = sizeof(xl_end_of_recovery);
|
||||
rdata.buffer = InvalidBuffer;
|
||||
rdata.next = NULL;
|
||||
|
||||
(void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
LocalXLogInsertAllowed = -1; /* return to "check" state */
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush all data in shared memory to disk, and fsync
|
||||
*
|
||||
@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||
|
||||
RecoveryRestartPoint(&checkPoint);
|
||||
}
|
||||
else if (info == XLOG_END_OF_RECOVERY)
|
||||
{
|
||||
xl_end_of_recovery xlrec;
|
||||
|
||||
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
|
||||
|
||||
/*
|
||||
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
|
||||
* but this case is rarer and harder to test, so the benefit doesn't
|
||||
* outweigh the potential extra cost of maintenance.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We should've already switched to the new TLI before replaying this
|
||||
* record.
|
||||
*/
|
||||
if (xlrec.ThisTimeLineID != ThisTimeLineID)
|
||||
ereport(PANIC,
|
||||
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
|
||||
xlrec.ThisTimeLineID, ThisTimeLineID)));
|
||||
}
|
||||
else if (info == XLOG_NOOP)
|
||||
{
|
||||
/* nothing to do here */
|
||||
@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
|
||||
|
||||
if (IsPromoteTriggered())
|
||||
{
|
||||
ereport(LOG,
|
||||
/*
|
||||
* In 9.1 and 9.2 the postmaster unlinked the promote file
|
||||
* inside the signal handler. We now leave the file in place
|
||||
* and let the Startup process do the unlink. This allows
|
||||
* Startup to know whether we're doing fast or normal
|
||||
* promotion. Fast promotion takes precedence.
|
||||
*/
|
||||
if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
|
||||
{
|
||||
unlink(FAST_PROMOTE_SIGNAL_FILE);
|
||||
unlink(PROMOTE_SIGNAL_FILE);
|
||||
fast_promote = true;
|
||||
}
|
||||
else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
|
||||
{
|
||||
unlink(PROMOTE_SIGNAL_FILE);
|
||||
fast_promote = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* We only look for fast promote via the pg_ctl promote option.
|
||||
* It would be possible to extend trigger file support for the
|
||||
* fast promotion option but that wouldn't be backwards compatible
|
||||
* anyway and we're looking to focus further work on the promote
|
||||
* option as the right way to signal end of recovery.
|
||||
*/
|
||||
if (fast_promote)
|
||||
ereport(LOG,
|
||||
(errmsg("received fast promote request")));
|
||||
else
|
||||
ereport(LOG,
|
||||
(errmsg("received promote request")));
|
||||
|
||||
ResetPromoteTriggered();
|
||||
triggered = true;
|
||||
return true;
|
||||
@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
|
||||
{
|
||||
struct stat stat_buf;
|
||||
|
||||
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
|
||||
{
|
||||
/*
|
||||
* Since we are in a signal handler, it's not safe to elog. We
|
||||
* silently ignore any error from unlink.
|
||||
*/
|
||||
unlink(PROMOTE_SIGNAL_FILE);
|
||||
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
|
||||
stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1136,6 +1136,15 @@ do_promote(void)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Use two different kinds of promotion file so we can understand
|
||||
* the difference between smart and fast promotion.
|
||||
*/
|
||||
if (shutdown_mode >= FAST_MODE)
|
||||
snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
|
||||
else
|
||||
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
|
||||
|
||||
if ((prmfile = fopen(promote_file, "w")) == NULL)
|
||||
{
|
||||
write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
|
||||
@ -1799,7 +1808,7 @@ do_help(void)
|
||||
" [-o \"OPTIONS\"]\n"), progname);
|
||||
printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
|
||||
printf(_(" %s status [-D DATADIR]\n"), progname);
|
||||
printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
|
||||
printf(_(" %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
|
||||
printf(_(" %s kill SIGNALNAME PID\n"), progname);
|
||||
#if defined(WIN32) || defined(__CYGWIN__)
|
||||
printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
|
||||
@ -1828,7 +1837,7 @@ do_help(void)
|
||||
printf(_(" -o OPTIONS command line options to pass to postgres\n"
|
||||
" (PostgreSQL server executable) or initdb\n"));
|
||||
printf(_(" -p PATH-TO-POSTGRES normally not necessary\n"));
|
||||
printf(_("\nOptions for stop or restart:\n"));
|
||||
printf(_("\nOptions for stop, restart or promote:\n"));
|
||||
printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
|
||||
|
||||
printf(_("\nShutdown modes are:\n"));
|
||||
@ -1836,6 +1845,10 @@ do_help(void)
|
||||
printf(_(" fast quit directly, with proper shutdown\n"));
|
||||
printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
|
||||
|
||||
printf(_("\nPromotion modes are:\n"));
|
||||
printf(_(" smart promote after performing a checkpoint\n"));
|
||||
printf(_(" fast promote quickly without waiting for checkpoint completion\n"));
|
||||
|
||||
printf(_("\nAllowed signal names for kill:\n"));
|
||||
printf(" ABRT HUP INT QUIT TERM USR1 USR2\n");
|
||||
|
||||
@ -2271,7 +2284,6 @@ main(int argc, char **argv)
|
||||
snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
|
||||
snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
|
||||
snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
|
||||
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
|
||||
}
|
||||
|
||||
switch (ctl_command)
|
||||
|
@ -217,6 +217,12 @@ typedef struct xl_restore_point
|
||||
char rp_name[MAXFNAMELEN];
|
||||
} xl_restore_point;
|
||||
|
||||
/*
 * End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint.
 *
 * This is the payload of an XLOG_END_OF_RECOVERY WAL record.  It is copied
 * out of the record with memcpy() of sizeof(xl_end_of_recovery) bytes by the
 * description, replay and timeline-switch code.
 */
|
||||
typedef struct xl_end_of_recovery
|
||||
{
|
||||
/* time at which the record was created; printed via timestamptz_to_str() */
TimestampTz end_time;
|
||||
/* timeline in effect when recovery ended; replay switches to it and checks it */
TimeLineID ThisTimeLineID;
|
||||
} xl_end_of_recovery;
|
||||
|
||||
/*
|
||||
* XLogRecord is defined in xlog.h, but we avoid #including that to keep
|
||||
|
@ -64,6 +64,7 @@ typedef struct CheckPoint
|
||||
#define XLOG_PARAMETER_CHANGE 0x60
|
||||
#define XLOG_RESTORE_POINT 0x70
|
||||
#define XLOG_FPW_CHANGE 0x80
|
||||
#define XLOG_END_OF_RECOVERY 0x90
|
||||
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user