Remove secondary checkpoint

Previously server reserved WAL for last two checkpoints,
which used too much disk space for small servers.

Bumps PG_CONTROL_VERSION

Author: Simon Riggs <simon@2ndQuadrant.com>
Reviewed-by: Michael Paquier <michael.paquier@gmail.com>
This commit is contained in:
Simon Riggs 2017-11-07 12:56:30 -05:00
parent 98267ee83e
commit 4b0d28de06
8 changed files with 61 additions and 125 deletions

View File

@ -568,7 +568,7 @@ tar -cf backup.tar /usr/local/pgsql/data
normally creates just a few segment files and then
<quote>recycles</quote> them by renaming no-longer-needed segment files
to higher segment numbers. It's assumed that segment files whose
contents precede the checkpoint-before-last are no longer of
contents precede the last checkpoint are no longer of
interest and can be recycled.
</para>

View File

@ -17948,11 +17948,6 @@ SELECT collation for ('foo' COLLATE "de_DE");
<entry><type>pg_lsn</type></entry>
</row>
<row>
<entry><literal>prior_lsn</literal></entry>
<entry><type>pg_lsn</type></entry>
</row>
<row>
<entry><literal>redo_lsn</literal></entry>
<entry><type>pg_lsn</type></entry>

View File

@ -2221,13 +2221,18 @@ CalculateCheckpointSegments(void)
* Calculate the distance at which to trigger a checkpoint, to avoid
* exceeding max_wal_size_mb. This is based on two assumptions:
*
* a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
* a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
* WAL for two checkpoint cycles to allow us to recover from the
* secondary checkpoint if the first checkpoint failed, though we
* only did this on the master anyway, not on standby. Keeping just
* one checkpoint simplifies processing and reduces disk space in
* many smaller databases.)
* b) during checkpoint, we consume checkpoint_completion_target *
* number of segments consumed between checkpoints.
*-------
*/
target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
(2.0 + CheckPointCompletionTarget);
(1.0 + CheckPointCompletionTarget);
/* round down */
CheckPointSegments = (int) target;
@ -2279,23 +2284,8 @@ XLOGfileslop(XLogRecPtr PriorRedoPtr)
* To estimate where the next checkpoint will finish, assume that the
* system runs steadily consuming CheckPointDistanceEstimate bytes between
* every checkpoint.
*
* The reason this calculation is done from the prior checkpoint, not the
* one that just finished, is that this behaves better if some checkpoint
* cycles are abnormally short, like if you perform a manual checkpoint
* right after a timed one. The manual checkpoint will make almost a full
* cycle's worth of WAL segments available for recycling, because the
* segments from the prior's prior, fully-sized checkpoint cycle are no
* longer needed. However, the next checkpoint will make only few segments
* available for recycling, the ones generated between the timed
* checkpoint and the manual one right after that. If at the manual
* checkpoint we only retained enough segments to get us to the next timed
* one, and removed the rest, then at the next checkpoint we would not
* have enough segments around for recycling, to get us to the checkpoint
* after that. Basing the calculations on the distance from the prior redo
* pointer largely fixes that problem.
*/
distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
/* add 10% for good measure. */
distance *= 1.10;
@ -6593,30 +6583,17 @@ StartupXLOG(void)
(errmsg("checkpoint record is at %X/%X",
(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
}
else if (StandbyMode)
else
{
/*
* The last valid checkpoint record required for a streaming
* recovery exists in neither standby nor the primary.
* We used to attempt to go back to a secondary checkpoint
* record here, but only when not in standby_mode. We now
* just fail if we can't read the last checkpoint because
* this allows us to simplify processing around checkpoints.
*/
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
else
{
checkPointLoc = ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
if (record != NULL)
{
ereport(LOG,
(errmsg("using previous checkpoint record at %X/%X",
(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
}
else
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
}
@ -6845,7 +6822,6 @@ StartupXLOG(void)
recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY;
}
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
if (InArchiveRecovery)
@ -7619,12 +7595,11 @@ StartupXLOG(void)
{
if (fast_promote)
{
checkPointLoc = ControlFile->prevCheckPoint;
checkPointLoc = ControlFile->checkPoint;
/*
* Confirm the last checkpoint is available for us to recover
* from if we fail. Note that we don't check for the secondary
* checkpoint since that isn't available in most base backups.
* from if we fail.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
if (record != NULL)
@ -8090,7 +8065,7 @@ LocalSetXLogInsertAllowed(void)
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
* 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
* 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
@ -8110,10 +8085,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid primary checkpoint link in control file")));
break;
case 2:
ereport(LOG,
(errmsg("invalid secondary checkpoint link in control file")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint link in backup_label file")));
@ -8135,10 +8106,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint record")));
@ -8154,10 +8121,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid resource manager ID in primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid resource manager ID in secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid resource manager ID in checkpoint record")));
@ -8175,10 +8138,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid xl_info in primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid xl_info in secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid xl_info in checkpoint record")));
@ -8194,10 +8153,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid length of primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid length of secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid length of checkpoint record")));
@ -8933,8 +8888,7 @@ CreateCheckPoint(int flags)
(errmsg("concurrent write-ahead log activity while database system is shutting down")));
/*
* Remember the prior checkpoint's redo pointer, used later to determine
* the point where the log can be truncated.
* Remember the prior checkpoint's redo ptr for UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
@ -8944,7 +8898,6 @@ CreateCheckPoint(int flags)
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
@ -8982,8 +8935,7 @@ CreateCheckPoint(int flags)
smgrpostckpt();
/*
* Delete old log files (those no longer needed even for previous
* checkpoint or the standbys in XLOG streaming).
* Delete old log files and recycle them
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
{
@ -8992,7 +8944,8 @@ CreateCheckPoint(int flags)
/* Update the average distance between checkpoints. */
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
XLByteToSeg(PriorRedoPtr, _logSegNo, wal_segment_size);
/* Trim from the last checkpoint, not the last - 1 */
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(recptr, &_logSegNo);
_logSegNo--;
RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
@ -9258,8 +9211,7 @@ CreateRestartPoint(int flags)
CheckPointGuts(lastCheckPoint.redo, flags);
/*
* Remember the prior checkpoint's redo pointer, used later to determine
* the point at which we can truncate the log.
* Remember the prior checkpoint's redo ptr for UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
@ -9273,7 +9225,6 @@ CreateRestartPoint(int flags)
if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
{
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = lastCheckPointRecPtr;
ControlFile->checkPointCopy = lastCheckPoint;
ControlFile->time = (pg_time_t) time(NULL);

View File

@ -93,41 +93,39 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
tupdesc = CreateTemplateTupleDesc(19, false);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "checkpoint_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prior_lsn",
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "redo_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "redo_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "redo_wal_file",
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "redo_wal_file",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 5, "timeline_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "timeline_id",
INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 6, "prev_timeline_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 5, "prev_timeline_id",
INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 7, "full_page_writes",
TupleDescInitEntry(tupdesc, (AttrNumber) 6, "full_page_writes",
BOOLOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 8, "next_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 7, "next_xid",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 9, "next_oid",
TupleDescInitEntry(tupdesc, (AttrNumber) 8, "next_oid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 10, "next_multixact_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 9, "next_multixact_id",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 11, "next_multi_offset",
TupleDescInitEntry(tupdesc, (AttrNumber) 10, "next_multi_offset",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 12, "oldest_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 11, "oldest_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 13, "oldest_xid_dbid",
TupleDescInitEntry(tupdesc, (AttrNumber) 12, "oldest_xid_dbid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 14, "oldest_active_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 13, "oldest_active_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 15, "oldest_multi_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 14, "oldest_multi_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 16, "oldest_multi_dbid",
TupleDescInitEntry(tupdesc, (AttrNumber) 15, "oldest_multi_dbid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 17, "oldest_commit_ts_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 16, "oldest_commit_ts_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "newest_commit_ts_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 17, "newest_commit_ts_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 19, "checkpoint_time",
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time",
TIMESTAMPTZOID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
@ -149,62 +147,59 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
values[0] = LSNGetDatum(ControlFile->checkPoint);
nulls[0] = false;
values[1] = LSNGetDatum(ControlFile->prevCheckPoint);
values[1] = LSNGetDatum(ControlFile->checkPointCopy.redo);
nulls[1] = false;
values[2] = LSNGetDatum(ControlFile->checkPointCopy.redo);
values[2] = CStringGetTextDatum(xlogfilename);
nulls[2] = false;
values[3] = CStringGetTextDatum(xlogfilename);
values[3] = Int32GetDatum(ControlFile->checkPointCopy.ThisTimeLineID);
nulls[3] = false;
values[4] = Int32GetDatum(ControlFile->checkPointCopy.ThisTimeLineID);
values[4] = Int32GetDatum(ControlFile->checkPointCopy.PrevTimeLineID);
nulls[4] = false;
values[5] = Int32GetDatum(ControlFile->checkPointCopy.PrevTimeLineID);
values[5] = BoolGetDatum(ControlFile->checkPointCopy.fullPageWrites);
nulls[5] = false;
values[6] = BoolGetDatum(ControlFile->checkPointCopy.fullPageWrites);
nulls[6] = false;
values[7] = CStringGetTextDatum(psprintf("%u:%u",
values[6] = CStringGetTextDatum(psprintf("%u:%u",
ControlFile->checkPointCopy.nextXidEpoch,
ControlFile->checkPointCopy.nextXid));
nulls[6] = false;
values[7] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid);
nulls[7] = false;
values[8] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid);
values[8] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMulti);
nulls[8] = false;
values[9] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMulti);
values[9] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMultiOffset);
nulls[9] = false;
values[10] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMultiOffset);
values[10] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestXid);
nulls[10] = false;
values[11] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestXid);
values[11] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestXidDB);
nulls[11] = false;
values[12] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestXidDB);
values[12] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestActiveXid);
nulls[12] = false;
values[13] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestActiveXid);
values[13] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestMulti);
nulls[13] = false;
values[14] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestMulti);
values[14] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestMultiDB);
nulls[14] = false;
values[15] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestMultiDB);
values[15] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestCommitTsXid);
nulls[15] = false;
values[16] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestCommitTsXid);
values[16] = TransactionIdGetDatum(ControlFile->checkPointCopy.newestCommitTsXid);
nulls[16] = false;
values[17] = TransactionIdGetDatum(ControlFile->checkPointCopy.newestCommitTsXid);
nulls[17] = false;
values[18] = TimestampTzGetDatum(
values[17] = TimestampTzGetDatum(
time_t_to_timestamptz(ControlFile->checkPointCopy.time));
nulls[18] = false;
nulls[17] = false;
htup = heap_form_tuple(tupdesc, values, nulls);

View File

@ -222,9 +222,6 @@ main(int argc, char *argv[])
printf(_("Latest checkpoint location: %X/%X\n"),
(uint32) (ControlFile->checkPoint >> 32),
(uint32) ControlFile->checkPoint);
printf(_("Prior checkpoint location: %X/%X\n"),
(uint32) (ControlFile->prevCheckPoint >> 32),
(uint32) ControlFile->prevCheckPoint);
printf(_("Latest checkpoint's REDO location: %X/%X\n"),
(uint32) (ControlFile->checkPointCopy.redo >> 32),
(uint32) ControlFile->checkPointCopy.redo);

View File

@ -876,7 +876,6 @@ RewriteControlFile(void)
ControlFile.state = DB_SHUTDOWNED;
ControlFile.time = (pg_time_t) time(NULL);
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
ControlFile.prevCheckPoint = 0;
ControlFile.minRecoveryPoint = 0;
ControlFile.minRecoveryPointTLI = 0;
ControlFile.backupStartPoint = 0;

View File

@ -21,7 +21,7 @@
/* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION 1003
#define PG_CONTROL_VERSION 1100
/* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN 32
@ -127,7 +127,6 @@ typedef struct ControlFileData
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint checkPointCopy; /* copy of last check point record */

View File

@ -5500,7 +5500,7 @@ DESCR("pg_config binary as a function");
DATA(insert OID = 3441 ( pg_control_system PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{23,23,20,1184}" "{o,o,o,o}" "{pg_control_version,catalog_version_no,system_identifier,pg_control_last_modified}" _null_ _null_ pg_control_system _null_ _null_ _null_ ));
DESCR("pg_controldata general state information as a function");
DATA(insert OID = 3442 ( pg_control_checkpoint PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,3220,3220,25,23,23,16,25,26,28,28,28,26,28,28,26,28,28,1184}" "{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}" "{checkpoint_lsn,prior_lsn,redo_lsn,redo_wal_file,timeline_id,prev_timeline_id,full_page_writes,next_xid,next_oid,next_multixact_id,next_multi_offset,oldest_xid,oldest_xid_dbid,oldest_active_xid,oldest_multi_xid,oldest_multi_dbid,oldest_commit_ts_xid,newest_commit_ts_xid,checkpoint_time}" _null_ _null_ pg_control_checkpoint _null_ _null_ _null_ ));
DATA(insert OID = 3442 ( pg_control_checkpoint PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,3220,25,23,23,16,25,26,28,28,28,26,28,28,26,28,28,1184}" "{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}" "{checkpoint_lsn,redo_lsn,redo_wal_file,timeline_id,prev_timeline_id,full_page_writes,next_xid,next_oid,next_multixact_id,next_multi_offset,oldest_xid,oldest_xid_dbid,oldest_active_xid,oldest_multi_xid,oldest_multi_dbid,oldest_commit_ts_xid,newest_commit_ts_xid,checkpoint_time}" _null_ _null_ pg_control_checkpoint _null_ _null_ _null_ ));
DESCR("pg_controldata checkpoint state information as a function");
DATA(insert OID = 3443 ( pg_control_recovery PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,23,3220,3220,16}" "{o,o,o,o,o}" "{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required}" _null_ _null_ pg_control_recovery _null_ _null_ _null_ ));