Add test case for an archive recovery corner case.

While I was working on a patch to refactor things around xlog.c, I mixed
up EndOfLogTLI and replayTLI at the end of recovery. As a result, if you
recovered to a point with a lower-numbered timeline in a WAL segment
that has a higher TLI in the filename, the end-of-recovery WAL record
was created with an invalid PrevTimeLineId. I noticed that while
self-reviewing, but no tests failed. So add a test to cover that corner
case.

Thanks to Amul Sul, who also submitted a test case for the same corner
case, although this patch takes a different approach.

Reviewed-by: Amul Sul, Michael Paquier
Discussion: https://www.postgresql.org/message-id/52bc9ccd-8591-431b-0086-15d9acf25a3f@iki.fi
Discussion: https://www.postgresql.org/message-id/CAAJ_b94Vjt5cXGza_1MkjLQWciNdEemsmiWuQj0d%3DM7JfjAa1g%40mail.gmail.com
Heikki Linnakangas 2022-02-14 11:33:57 +02:00
parent 1383d52faa
commit 50e5bc582a
@@ -0,0 +1,176 @@
# Copyright (c) 2022, PostgreSQL Global Development Group

# Test recovering to a point-in-time using WAL archive, such that the
# target point is physically in a WAL segment with a higher TLI than
# the target point's TLI. For example, imagine that the following WAL
# segments exist in the WAL archive:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000020000000000000003
#
# The timeline switch happened in the middle of WAL segment 3, but it
# was never archived on timeline 1. The first half of
# 000000020000000000000003 contains the WAL from timeline 1 up to the
# point where the timeline switch happened. If you now perform
# archive recovery with a recovery target point in that first half of
# segment 3, archive recovery will find the WAL up to that point in
# segment 000000020000000000000003, but it will not follow the
# timeline switch to timeline 2, and will create a timeline-switching
# end-of-recovery record with TLI 1 -> 3. That's what this test case
# tests.
#
# The comments below contain lists of WAL segments at different points
# in the tests, to make it easier to follow along. They are correct
# as of this writing, but the exact WAL segment numbers could change
# if the backend logic for when it switches to a new segment changes.
# The actual checks are not sensitive to that.

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
use File::Compare;

# Initialize and start primary node with WAL archiving
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(has_archiving => 1, allows_streaming => 1);
$node_primary->start;

# Take a backup.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# Workload with some transactions, and the target restore point.
$node_primary->psql(
	'postgres', qq{
CREATE TABLE foo(i int);
INSERT INTO foo VALUES(1);
SELECT pg_create_restore_point('rp');
INSERT INTO foo VALUES(2);
});

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
#
# The operations on the test table and the restore point went into WAL
# segment 3, but it hasn't been archived yet.

# Start a standby node, and wait for it to catch up.
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup(
	$node_primary, $backup_name,
	standby => 1,
	has_streaming => 1,
	has_archiving => 1,
	has_restoring => 0);
$node_standby->append_conf('postgresql.conf', 'archive_mode = always');
$node_standby->start;
$node_primary->wait_for_catchup($node_standby);

# Check that it's really caught up.
my $result = $node_standby->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{2}, "check table contents after archive recovery");

# Kill the old primary, before it archives the most recent WAL segment that
# contains all the INSERTs.
$node_primary->stop('immediate');

# Promote the standby, and switch WAL so that it archives a WAL segment
# that contains all the INSERTs, on a new timeline.
$node_standby->promote;
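# Extra sanity check, as an illustrative sketch rather than something
# the rest of the test depends on: promotion should have moved the
# standby to timeline 2.  The TLI is read from the first 8 hex digits
# of the current WAL file name.
my $standby_tli = $node_standby->safe_psql('postgres',
	"SELECT substr(pg_walfile_name(pg_current_wal_lsn()), 1, 8)::int;");
is($standby_tli, qq{2}, "standby is on timeline 2 after promotion");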
# Find next WAL segment to be archived.
my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
	"SELECT pg_walfile_name(pg_current_wal_lsn());");

# Make WAL segment eligible for archival
$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');

# Wait until the WAL segment has been archived.
my $archive_wait_query =
  "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;";
$node_standby->poll_query_until('postgres', $archive_wait_query)
  or die "Timed out while waiting for WAL segment to be archived";
my $last_archived_wal_file = $walfile_to_be_archived;

# Ok, the standby has now archived the WAL on timeline 2. We don't
# need the standby anymore.
$node_standby->stop;

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
# 000000010000000000000003.partial
# 000000020000000000000003
# 00000002.history
#
# The operations on the test table and the restore point are in
# segment 3. They are part of timeline 1, but were not archived by
# the primary yet. However, they were copied into the beginning of
# segment 000000020000000000000003, before the timeline switching
# record. (They are also present in the
# 000000010000000000000003.partial file, but .partial files are not
# used automatically.)

# Now test PITR to the recovery target. It should find the WAL in
# segment 000000020000000000000003, but not follow the timeline switch
# to timeline 2.
my $node_pitr = PostgreSQL::Test::Cluster->new('node_pitr');
$node_pitr->init_from_backup(
	$node_primary, $backup_name,
	standby => 0,
	has_restoring => 1);
$node_pitr->append_conf(
	'postgresql.conf', qq{
recovery_target_name = 'rp'
recovery_target_action = 'promote'
});
$node_pitr->start;

# Wait until recovery finishes.
$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for PITR promotion";

# Check that we see the data we expect.
$result = $node_pitr->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{1}, "check table contents after point-in-time recovery");

# Insert a row so that we can check later that we successfully recover
# back to this timeline.
$node_pitr->safe_psql('postgres', "INSERT INTO foo VALUES(3);");
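# Extra sanity check, again as an illustrative sketch: the promotion
# at the end of PITR should have picked timeline 3, because timeline 2
# already exists in the archive.  As above, the TLI is read from the
# first 8 hex digits of the current WAL file name.
my $pitr_tli = $node_pitr->safe_psql('postgres',
	"SELECT substr(pg_walfile_name(pg_current_wal_lsn()), 1, 8)::int;");
is($pitr_tli, qq{3}, "PITR node is on timeline 3");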
# Stop the node. This archives the last segment.
$node_pitr->stop();

# Test archive recovery on the timeline created by the PITR. This
# replays the end-of-recovery record that switches from timeline 1 to
# 3.
my $node_pitr2 = PostgreSQL::Test::Cluster->new('node_pitr2');
$node_pitr2->init_from_backup(
	$node_primary, $backup_name,
	standby => 0,
	has_restoring => 1);
$node_pitr2->append_conf(
	'postgresql.conf', qq{
recovery_target_action = 'promote'
});
$node_pitr2->start;

# Wait until recovery finishes.
$node_pitr2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for PITR promotion";

# Verify that we can see the row inserted after the PITR.
$result = $node_pitr2->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{3}, "check table contents after point-in-time recovery");
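# For manual debugging, the end-of-recovery record exercised by this
# test can be inspected with pg_waldump.  A sketch, where the segment
# name is illustrative and depends on the segment numbering noted in
# the comments above:
#
#   pg_waldump <archive_dir>/000000030000000000000003 | grep END_OF_RECOVERY
#
# With the fix, the record should show the switch from TLI 1 to TLI 3
# in its previous-timeline field.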
done_testing();