From 9c4f5192f69ed16c99e0d079f0b5faebd7bad212 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 12 Nov 2020 14:52:24 +0200
Subject: [PATCH] Allow pg_rewind to use a standby server as the source system.

Using a hot standby server as the source has not been possible, because
pg_rewind creates a temporary table in the source system, to hold the
list of file ranges that need to be fetched. Refactor it to queue up the
file fetch requests in pg_rewind's memory, so that the temporary table
is no longer needed.

Also update the logic to compute 'minRecoveryPoint' correctly when the
source is a standby server.

Reviewed-by: Kyotaro Horiguchi, Soumyadeep Chakraborty
Discussion: https://www.postgresql.org/message-id/0c5b3783-af52-3ee5-f8fa-6e794061f70d%40iki.fi
---
 doc/src/sgml/ref/pg_rewind.sgml           |   2 +-
 src/bin/pg_rewind/libpq_source.c          | 291 +++++++++++++++-------
 src/bin/pg_rewind/pg_rewind.c             |  99 ++++++--
 src/bin/pg_rewind/t/003_extrafiles.pl     |  26 +-
 src/bin/pg_rewind/t/007_standby_source.pl | 174 +++++++++++++
 5 files changed, 484 insertions(+), 108 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/007_standby_source.pl

diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 43282e6016..07aae75d8b 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -173,7 +173,7 @@ PostgreSQL documentation
        with a role having sufficient permissions to execute the functions
        used by pg_rewind on the source server (see Notes section for
        details) or a superuser role.  This option
-       requires the source server to be running and not in recovery mode.
+       requires the source server to be running and accepting connections.

diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index c73e8bf470..47beba277a 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -14,30 +14,51 @@
 #include "datapagemap.h"
 #include "file_ops.h"
 #include "filemap.h"
+#include "lib/stringinfo.h"
 #include "pg_rewind.h"
 #include "port/pg_bswap.h"
 #include "rewind_source.h"
 
 /*
- * Files are fetched max CHUNKSIZE bytes at a time.
- *
- * (This only applies to files that are copied in whole, or for truncated
- * files where we copy the tail. Relation files, where we know the individual
- * blocks that need to be fetched, are fetched in BLCKSZ chunks.)
+ * Files are fetched MAX_CHUNK_SIZE bytes at a time, and with a
+ * maximum of MAX_CHUNKS_PER_QUERY chunks in a single query.
  */
-#define CHUNKSIZE 1000000
+#define MAX_CHUNK_SIZE (1024 * 1024)
+#define MAX_CHUNKS_PER_QUERY 1000
+
+/* represents a request to fetch a piece of a file from the source */
+typedef struct
+{
+	const char *path;			/* path relative to data directory root */
+	off_t		offset;
+	size_t		length;
+} fetch_range_request;
 
 typedef struct
 {
 	rewind_source common;		/* common interface functions */
 
 	PGconn	   *conn;
-	bool		copy_started;
+
+	/*
+	 * Queue of chunks that have been requested with the queue_fetch_range()
+	 * function, but have not been fetched from the remote server yet.
+	 */
+	int			num_requests;
+	fetch_range_request request_queue[MAX_CHUNKS_PER_QUERY];
+
+	/* temporary space for process_queued_fetch_requests() */
+	StringInfoData paths;
+	StringInfoData offsets;
+	StringInfoData lengths;
 } libpq_source;
 
 static void init_libpq_conn(PGconn *conn);
 static char *run_simple_query(PGconn *conn, const char *sql);
 static void run_simple_command(PGconn *conn, const char *sql);
+static void appendArrayEscapedString(StringInfo buf, const char *str);
+
+static void process_queued_fetch_requests(libpq_source *src);
 
 /* public interface functions */
 static void libpq_traverse_files(rewind_source *source,
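The two constants above replace the old CHUNKSIZE: MAX_CHUNK_SIZE caps how much of a file one queue entry covers, and MAX_CHUNKS_PER_QUERY caps how many entries are batched into a single query. The standalone sketch below is not part of the patch (the 2.5 MiB request is an arbitrary example); it shows the splitting arithmetic that libpq_queue_fetch_range() applies further down:

    #include <stdio.h>

    #define MAX_CHUNK_SIZE (1024 * 1024)

    int
    main(void)
    {
        /* an arbitrary oversized request: 2.5 MiB starting at offset 0 */
        long    off = 0;
        long    len = 2621440;

        /* same arithmetic as the "while (len > 0)" loop in the patch */
        while (len > 0)
        {
            long    thislen = (len < MAX_CHUNK_SIZE) ? len : MAX_CHUNK_SIZE;

            printf("queue entry: offset=%ld length=%ld\n", off, thislen);
            off += thislen;
            len -= thislen;
        }
        return 0;
    }

This prints entries of 1 MiB, 1 MiB and 512 KiB. A later request contiguous with the last entry first tops that entry up to MAX_CHUNK_SIZE before a new one is started, which is the merging case handled at the top of libpq_queue_fetch_range().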
@@ -74,6 +95,10 @@
 
 	src->conn = conn;
 
+	initStringInfo(&src->paths);
+	initStringInfo(&src->offsets);
+	initStringInfo(&src->lengths);
+
 	return &src->common;
 }
 
@@ -91,6 +116,12 @@ init_libpq_conn(PGconn *conn)
 	run_simple_command(conn, "SET lock_timeout = 0");
 	run_simple_command(conn, "SET idle_in_transaction_session_timeout = 0");
 
+	/*
+	 * We don't intend to do any updates, so put the connection in read-only
+	 * mode to keep us honest.
+	 */
+	run_simple_command(conn, "SET default_transaction_read_only = on");
+
 	/* secure search_path */
 	res = PQexec(conn, ALWAYS_SECURE_SEARCH_PATH_SQL);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -98,17 +129,6 @@
 				 PQresultErrorMessage(res));
 	PQclear(res);
 
-	/*
-	 * Check that the server is not in hot standby mode. There is no
-	 * fundamental reason that couldn't be made to work, but it doesn't
-	 * currently because we use a temporary table. Better to check for it
-	 * explicitly than error out, for a better error message.
-	 */
-	str = run_simple_query(conn, "SELECT pg_is_in_recovery()");
-	if (strcmp(str, "f") != 0)
-		pg_fatal("source server must not be in recovery mode");
-	pg_free(str);
-
 	/*
 	 * Also check that full_page_writes is enabled. We can get torn pages if
 	 * a page is modified while we read it with pg_read_binary_file(), and we
@@ -118,6 +138,18 @@
 	if (strcmp(str, "on") != 0)
 		pg_fatal("full_page_writes must be enabled in the source server");
 	pg_free(str);
+
+	/* Prepare a statement we'll use to fetch files */
+	res = PQprepare(conn, "fetch_chunks_stmt",
+					"SELECT path, begin,\n"
+					"  pg_read_binary_file(path, begin, len, true) AS chunk\n"
+					"FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
+					3, NULL);
+
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		pg_fatal("could not prepare statement to fetch file contents: %s",
+				 PQresultErrorMessage(res));
+	PQclear(res);
 }
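This prepared statement is the heart of the change: instead of COPYing a work list into a temporary table, which a read-only standby cannot create, the list of chunks travels as three parallel arrays that unnest() turns back into one row per chunk. A minimal standalone libpq sketch of the same technique follows; the connection string and the fetched files are placeholder examples, and pg_read_binary_file() requires a suitably privileged role on the server:

    #include <stdio.h>
    #include <libpq-fe.h>

    int
    main(void)
    {
        PGconn     *conn = PQconnectdb("dbname=postgres");  /* placeholder */
        /* three parallel arrays: paths, offsets, lengths */
        const char *params[3] = {
            "{global/pg_control,PG_VERSION}",
            "{0,0}",
            "{8192,4}"
        };
        PGresult   *res;

        if (PQstatus(conn) != CONNECTION_OK)
        {
            fprintf(stderr, "%s", PQerrorMessage(conn));
            return 1;
        }

        res = PQexecParams(conn,
                           "SELECT path, begin,"
                           "  pg_read_binary_file(path, begin, len, true)"
                           " FROM unnest($1::text[], $2::int8[], $3::int4[])"
                           "   AS x(path, begin, len)",
                           3, NULL, params, NULL, NULL,
                           1);      /* request binary results */

        if (PQresultStatus(res) == PGRES_TUPLES_OK)
            printf("fetched %d chunks in one round trip\n", PQntuples(res));
        else
            fprintf(stderr, "%s", PQresultErrorMessage(res));

        PQclear(res);
        PQfinish(conn);
        return 0;
    }

Because the paths are embedded in text[] array literals, any double quote or backslash in a file name must be escaped; that is what appendArrayEscapedString(), added later in this patch, takes care of.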
@@ -283,94 +315,125 @@ libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off,
 				   size_t len)
 {
 	libpq_source *src = (libpq_source *) source;
-	uint64		begin = off;
-	uint64		end = off + len;
 
 	/*
-	 * On first call, create a temporary table, and start COPYing to it.
-	 * We will load it with the list of blocks that we need to fetch.
+	 * Does this request happen to be a continuation of the previous chunk? If
+	 * so, merge it with the previous one.
+	 *
+	 * XXX: We use pointer equality to compare the path. That's good enough
+	 * for our purposes; the caller always passes the same pointer for the
+	 * same filename. If it didn't, we would fail to merge requests, but it
+	 * wouldn't affect correctness.
 	 */
-	if (!src->copy_started)
+	if (src->num_requests > 0)
 	{
-		PGresult   *res;
+		fetch_range_request *prev = &src->request_queue[src->num_requests - 1];
 
-		run_simple_command(src->conn, "CREATE TEMPORARY TABLE fetchchunks(path text, begin int8, len int4)");
+		if (prev->offset + prev->length == off &&
+			prev->length < MAX_CHUNK_SIZE &&
+			prev->path == path)
+		{
+			/*
+			 * Extend the previous request to cover as much of this new
+			 * request as possible, without exceeding MAX_CHUNK_SIZE.
+			 */
+			size_t		thislen;
 
-		res = PQexec(src->conn, "COPY fetchchunks FROM STDIN");
-		if (PQresultStatus(res) != PGRES_COPY_IN)
-			pg_fatal("could not send file list: %s",
-					 PQresultErrorMessage(res));
-		PQclear(res);
+			thislen = Min(len, MAX_CHUNK_SIZE - prev->length);
+			prev->length += thislen;
 
-		src->copy_started = true;
+			off += thislen;
+			len -= thislen;
+
+			/*
+			 * Fall through to create new requests for any remaining 'len'
+			 * that didn't fit in the previous chunk.
+			 */
+		}
 	}
 
-	/*
-	 * Write the file range to a temporary table in the server.
-	 *
-	 * The range is sent to the server as a COPY formatted line, to be inserted
-	 * into the 'fetchchunks' temporary table. The libpq_finish_fetch() uses
-	 * the temporary table to actually fetch the data.
-	 */
-
-	/* Split the range into CHUNKSIZE chunks */
-	while (end - begin > 0)
+	/* Divide the request into pieces of MAX_CHUNK_SIZE bytes each */
+	while (len > 0)
 	{
-		char		linebuf[MAXPGPATH + 23];
-		unsigned int len;
+		int32		thislen;
 
-		/* Fine as long as CHUNKSIZE is not bigger than UINT32_MAX */
-		if (end - begin > CHUNKSIZE)
-			len = CHUNKSIZE;
-		else
-			len = (unsigned int) (end - begin);
+		/* if the queue is full, perform all the work queued up so far */
+		if (src->num_requests == MAX_CHUNKS_PER_QUERY)
+			process_queued_fetch_requests(src);
 
-		snprintf(linebuf, sizeof(linebuf), "%s\t" UINT64_FORMAT "\t%u\n", path, begin, len);
+		thislen = Min(len, MAX_CHUNK_SIZE);
+		src->request_queue[src->num_requests].path = path;
+		src->request_queue[src->num_requests].offset = off;
+		src->request_queue[src->num_requests].length = thislen;
+		src->num_requests++;
 
-		if (PQputCopyData(src->conn, linebuf, strlen(linebuf)) != 1)
-			pg_fatal("could not send COPY data: %s",
-					 PQerrorMessage(src->conn));
-
-		begin += len;
+		off += thislen;
+		len -= thislen;
 	}
 }
 
 /*
- * Receive all the queued chunks and write them to the target data directory.
+ * Fetch all the queued chunks and write them to the target data directory.
  */
 static void
 libpq_finish_fetch(rewind_source *source)
 {
-	libpq_source *src = (libpq_source *) source;
+	process_queued_fetch_requests((libpq_source *) source);
+}
+
+static void
+process_queued_fetch_requests(libpq_source *src)
+{
+	const char *params[3];
 	PGresult   *res;
-	const char *sql;
+	int			chunkno;
 
-	if (PQputCopyEnd(src->conn, NULL) != 1)
-		pg_fatal("could not send end-of-COPY: %s",
-				 PQerrorMessage(src->conn));
+	if (src->num_requests == 0)
+		return;
 
-	while ((res = PQgetResult(src->conn)) != NULL)
-	{
-		if (PQresultStatus(res) != PGRES_COMMAND_OK)
-			pg_fatal("unexpected result while sending file list: %s",
-					 PQresultErrorMessage(res));
-		PQclear(res);
-	}
+	pg_log_debug("getting %d file chunks", src->num_requests);
 
 	/*
-	 * We've now copied the list of file ranges that we need to fetch to the
-	 * temporary table. Now, actually fetch all of those ranges.
+	 * The prepared statement, 'fetch_chunks_stmt', takes three arrays with
+	 * the same length as parameters: paths, offsets and lengths. Construct
+	 * the string representations of them.
 	 */
-	sql =
-		"SELECT path, begin,\n"
-		"  pg_read_binary_file(path, begin, len, true) AS chunk\n"
-		"FROM fetchchunks\n";
+	resetStringInfo(&src->paths);
+	resetStringInfo(&src->offsets);
+	resetStringInfo(&src->lengths);
 
+	appendStringInfoChar(&src->paths, '{');
+	appendStringInfoChar(&src->offsets, '{');
+	appendStringInfoChar(&src->lengths, '{');
+	for (int i = 0; i < src->num_requests; i++)
+	{
+		fetch_range_request *rq = &src->request_queue[i];
+
+		if (i > 0)
+		{
+			appendStringInfoChar(&src->paths, ',');
+			appendStringInfoChar(&src->offsets, ',');
+			appendStringInfoChar(&src->lengths, ',');
+		}
+
+		appendArrayEscapedString(&src->paths, rq->path);
+		appendStringInfo(&src->offsets, INT64_FORMAT, (int64) rq->offset);
+		appendStringInfo(&src->lengths, INT64_FORMAT, (int64) rq->length);
+	}
+	appendStringInfoChar(&src->paths, '}');
+	appendStringInfoChar(&src->offsets, '}');
+	appendStringInfoChar(&src->lengths, '}');
+
+	/*
+	 * Execute the prepared statement.
+	 */
+	params[0] = src->paths.data;
+	params[1] = src->offsets.data;
+	params[2] = src->lengths.data;
+
-	if (PQsendQueryParams(src->conn, sql, 0, NULL, NULL, NULL, NULL, 1) != 1)
+	if (PQsendQueryPrepared(src->conn, "fetch_chunks_stmt", 3, params, NULL, NULL, 1) != 1)
 		pg_fatal("could not send query: %s", PQerrorMessage(src->conn));
 
-	pg_log_debug("getting file chunks");
-
 	if (PQsetSingleRowMode(src->conn) != 1)
 		pg_fatal("could not set libpq connection to single row mode");
 
@@ -382,8 +445,10 @@ libpq_finish_fetch(rewind_source *source)
 	 * chunk	bytea	-- file content
 	 *----
 	 */
+	chunkno = 0;
 	while ((res = PQgetResult(src->conn)) != NULL)
 	{
+		fetch_range_request *rq = &src->request_queue[chunkno];
 		char	   *filename;
 		int			filenamelen;
 		int64		chunkoff;
@@ -404,6 +469,9 @@ libpq_finish_fetch(rewind_source *source)
 					 PQresultErrorMessage(res));
 		}
 
+		if (chunkno > src->num_requests)
+			pg_fatal("received more data chunks than requested");
+
 		/* sanity check the result set */
 		if (PQnfields(res) != 3 || PQntuples(res) != 1)
 			pg_fatal("unexpected result set size while fetching remote files");
@@ -448,31 +516,74 @@ libpq_finish_fetch(rewind_source *source)
 		 * If a file has been deleted on the source, remove it on the target
 		 * as well. Note that multiple unlink() calls may happen on the same
 		 * file if multiple data chunks are associated with it, hence ignore
-		 * unconditionally anything missing. If this file is not a relation
-		 * data file, then it has been already truncated when creating the
-		 * file chunk list at the previous execution of the filemap.
+		 * unconditionally anything missing.
 		 */
 		if (PQgetisnull(res, 0, 2))
 		{
 			pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
 						 filename);
 			remove_target_file(filename, true);
-			pg_free(filename);
-			PQclear(res);
-			continue;
 		}
+		else
+		{
+			pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
+						 filename, chunkoff, chunksize);
 
-		pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
-					 filename, chunkoff, chunksize);
+			if (strcmp(filename, rq->path) != 0)
+			{
+				pg_fatal("received data for file \"%s\", when requested for \"%s\"",
+						 filename, rq->path);
+			}
+			if (chunkoff != rq->offset)
+				pg_fatal("received data at offset " INT64_FORMAT " of file \"%s\", when requested for offset " INT64_FORMAT,
+						 chunkoff, rq->path, (int64) rq->offset);
 
-		open_target_file(filename, false);
+			/*
+			 * We should not receive more data than we requested, or
+			 * pg_read_binary_file() messed up. We could receive less,
+			 * though, if the file was truncated in the source after we
+			 * checked its size. That's OK; there should be a WAL record of
+			 * the truncation, which will get replayed when you start the
+			 * target system for the first time after pg_rewind has completed.
+			 */
+			if (chunksize > rq->length)
+				pg_fatal("received more than requested for file \"%s\"", rq->path);
 
-		write_target_range(chunk, chunkoff, chunksize);
+			open_target_file(filename, false);
+
+			write_target_range(chunk, chunkoff, chunksize);
+		}
 
 		pg_free(filename);
 
 		PQclear(res);
+		chunkno++;
 	}
+	if (chunkno != src->num_requests)
+		pg_fatal("unexpected number of data chunks received");
+
+	src->num_requests = 0;
+}
+
+/*
+ * Escape a string to be used as element in a text array constant
+ */
+static void
+appendArrayEscapedString(StringInfo buf, const char *str)
+{
+	appendStringInfoCharMacro(buf, '\"');
+	while (*str)
+	{
+		char		ch = *str;
+
+		if (ch == '"' || ch == '\\')
+			appendStringInfoCharMacro(buf, '\\');
+
+		appendStringInfoCharMacro(buf, ch);
+
+		str++;
+	}
+	appendStringInfoCharMacro(buf, '\"');
+}
 
 /*
@@ -521,6 +632,12 @@ libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
 static void
 libpq_destroy(rewind_source *source)
 {
-	pfree(source);
+	libpq_source *src = (libpq_source *) source;
+
+	pfree(src->paths.data);
+	pfree(src->offsets.data);
+	pfree(src->lengths.data);
+	pfree(src);
+
 	/* NOTE: we don't close the connection here, as it was not opened by us. */
 }
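process_queued_fetch_requests() consumes its result set in libpq's single-row mode, so up to a thousand 1 MB chunks per batch are written out one at a time instead of being accumulated in a single PGresult. The pattern, reduced to a standalone sketch (not from the patch):

    #include <stdio.h>
    #include <libpq-fe.h>

    /*
     * Consume the results of a previously sent query one row at a time.
     * PQsetSingleRowMode() must be called right after PQsendQuery() or
     * PQsendQueryPrepared(), before the first PQgetResult().
     */
    static void
    consume_single_rows(PGconn *conn)
    {
        PGresult   *res;

        if (PQsetSingleRowMode(conn) != 1)
            fprintf(stderr, "could not enter single-row mode\n");

        while ((res = PQgetResult(conn)) != NULL)
        {
            if (PQresultStatus(res) == PGRES_SINGLE_TUPLE)
            {
                /* exactly one tuple per PGresult: process it here */
            }
            else if (PQresultStatus(res) != PGRES_TUPLES_OK)
            {
                /* PGRES_TUPLES_OK is the zero-row result that ends the set */
                fprintf(stderr, "%s", PQresultErrorMessage(res));
            }
            PQclear(res);
        }
    }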
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 52e3fc40e8..2bbf8e7438 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -50,6 +50,7 @@ static void disconnect_atexit(void);
 
 static ControlFileData ControlFile_target;
 static ControlFileData ControlFile_source;
+static ControlFileData ControlFile_source_after;
 
 const char *progname;
 int			WalSegSz;
@@ -486,6 +487,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 	XLogRecPtr	endrec;
 	TimeLineID	endtli;
 	ControlFileData ControlFile_new;
+	size_t		size;
+	char	   *buffer;
 
 	/*
 	 * Execute the actions in the file map, fetching data from the source
@@ -552,40 +555,104 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 		}
 	}
 
-	/*
-	 * We've now copied the list of file ranges that we need to fetch to the
-	 * temporary table. Now, actually fetch all of those ranges.
-	 */
+	/* Complete any remaining range-fetches that we queued up above. */
 	source->finish_fetch(source);
 
 	close_target_file();
 
 	progress_report(true);
 
+	/*
+	 * Fetch the control file from the source last. This ensures that the
+	 * minRecoveryPoint is up-to-date.
+	 */
+	buffer = source->fetch_file(source, "global/pg_control", &size);
+	digestControlFile(&ControlFile_source_after, buffer, size);
+	pg_free(buffer);
+
+	/*
+	 * Sanity check: If the source is a local system, the control file should
+	 * not have changed since we started.
+	 *
+	 * XXX: We assume it hasn't been modified, but actually, what could go
+	 * wrong? The logic handles a libpq source that's modified concurrently,
+	 * why not a local datadir?
+	 */
+	if (datadir_source &&
+		memcmp(&ControlFile_source, &ControlFile_source_after,
+			   sizeof(ControlFileData)) != 0)
+	{
+		pg_fatal("source system was modified while pg_rewind was running");
+	}
+
 	if (showprogress)
 		pg_log_info("creating backup label and updating control file");
+
+	/*
+	 * Create a backup label file, to tell the target where to begin the WAL
+	 * replay. Normally, from the last common checkpoint between the source
+	 * and the target. But if the source is a standby server, it's possible
+	 * that the last common checkpoint is *after* the standby's restartpoint.
+	 * That implies that the source server has applied the checkpoint record,
+	 * but hasn't performed a corresponding restartpoint yet. Make sure we
+	 * start at the restartpoint's redo point in that case.
+	 *
+	 * Use the old version of the source's control file for this. The server
+	 * might have finished the restartpoint after we started copying files,
+	 * but we must begin from the redo point at the time we started copying.
+	 */
+	if (ControlFile_source.checkPointCopy.redo < chkptredo)
+	{
+		chkptredo = ControlFile_source.checkPointCopy.redo;
+		chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		chkptrec = ControlFile_source.checkPoint;
+	}
 	createBackupLabel(chkptredo, chkpttli, chkptrec);
 
 	/*
-	 * Update control file of target. Make it ready to perform archive
-	 * recovery when restarting.
-	 *
-	 * Like in an online backup, it's important that we replay all the WAL
-	 * that was generated while we copied the files over. To enforce that, set
-	 * 'minRecoveryPoint' in the control file.
+	 * Update control file of target, to tell the target how far it must
+	 * replay the WAL (minRecoveryPoint).
 	 */
-	memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));
-
 	if (connstr_source)
 	{
-		endrec = source->get_current_wal_insert_lsn(source);
-		endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		/*
+		 * The source is a live server. Like in an online backup, it's
+		 * important that we recover all the WAL that was generated while we
+		 * were copying files.
+		 */
+		if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
+		{
+			/*
+			 * Source is a standby server. We must replay to its
+			 * minRecoveryPoint.
+			 */
+			endrec = ControlFile_source_after.minRecoveryPoint;
+			endtli = ControlFile_source_after.minRecoveryPointTLI;
+		}
+		else
+		{
+			/*
+			 * Source is a production (non-standby) server. We must replay to
+			 * the last WAL insert location.
+			 */
+			if (ControlFile_source_after.state != DB_IN_PRODUCTION)
+				pg_fatal("source system was in unexpected state at end of rewind");
+
+			endrec = source->get_current_wal_insert_lsn(source);
+			endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+		}
 	}
 	else
 	{
-		endrec = ControlFile_source.checkPoint;
-		endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		/*
+		 * Source is a local data directory. It should've shut down cleanly,
+		 * and we must replay to the latest shutdown checkpoint.
+		 */
+		endrec = ControlFile_source_after.checkPoint;
+		endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
 	}
+
+	memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
 	ControlFile_new.minRecoveryPoint = endrec;
 	ControlFile_new.minRecoveryPointTLI = endtli;
 	ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
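The end-of-replay selection above boils down to a three-way decision. The sketch below is an illustration only, condensed from the perform_rewind() changes; ControlFileData and the DB_* state constants come from catalog/pg_control.h, and insert_lsn stands in for the value returned by source->get_current_wal_insert_lsn():

    /* Illustration: how the target's minRecoveryPoint is now chosen. */
    static XLogRecPtr
    choose_end_of_wal(bool used_libpq_source,
                      const ControlFileData *source_after,
                      XLogRecPtr insert_lsn)
    {
        if (!used_libpq_source)
            return source_after->checkPoint;        /* stopped local datadir */
        if (source_after->state == DB_IN_ARCHIVE_RECOVERY)
            return source_after->minRecoveryPoint;  /* standby source */
        return insert_lsn;                          /* live primary source */
    }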
+ */ + endrec = ControlFile_source_after.checkPoint; + endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID; } + + memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData)); ControlFile_new.minRecoveryPoint = endrec; ControlFile_new.minRecoveryPointTLI = endtli; ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY; diff --git a/src/bin/pg_rewind/t/003_extrafiles.pl b/src/bin/pg_rewind/t/003_extrafiles.pl index 48849fb49a..ece6ba2e43 100644 --- a/src/bin/pg_rewind/t/003_extrafiles.pl +++ b/src/bin/pg_rewind/t/003_extrafiles.pl @@ -40,10 +40,22 @@ sub run_test "in standby1"; append_to_file "$test_standby_datadir/tst_standby_dir/standby_file2", "in standby2"; + append_to_file + "$test_standby_datadir/tst_standby_dir/standby_file3 with 'quotes'", + "in standby3"; + append_to_file + "$test_standby_datadir/tst_standby_dir/standby_file4 with double\"quote", + "in standby4"; + append_to_file + "$test_standby_datadir/tst_standby_dir/standby_file5 with back\\slash", + "in standby5"; + append_to_file + "$test_standby_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote", + "in standby6"; mkdir "$test_standby_datadir/tst_standby_dir/standby_subdir/"; append_to_file - "$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file3", - "in standby3"; + "$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file7", + "in standby7"; mkdir "$test_primary_datadir/tst_primary_dir"; append_to_file "$test_primary_datadir/tst_primary_dir/primary_file1", @@ -58,7 +70,9 @@ sub run_test RewindTest::promote_standby(); RewindTest::run_pg_rewind($test_mode); - # List files in the data directory after rewind. + # List files in the data directory after rewind. All the files that + # were present in the standby should be present after rewind, and + # all the files that were added on the primary should be removed. my @paths; find( sub { @@ -78,8 +92,12 @@ sub run_test "$test_primary_datadir/tst_standby_dir", "$test_primary_datadir/tst_standby_dir/standby_file1", "$test_primary_datadir/tst_standby_dir/standby_file2", + "$test_primary_datadir/tst_standby_dir/standby_file3 with 'quotes'", + "$test_primary_datadir/tst_standby_dir/standby_file4 with double\"quote", + "$test_primary_datadir/tst_standby_dir/standby_file5 with back\\slash", + "$test_primary_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote", "$test_primary_datadir/tst_standby_dir/standby_subdir", - "$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file3" + "$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file7" ], "file lists match"); diff --git a/src/bin/pg_rewind/t/007_standby_source.pl b/src/bin/pg_rewind/t/007_standby_source.pl new file mode 100644 index 0000000000..64b7f1a3be --- /dev/null +++ b/src/bin/pg_rewind/t/007_standby_source.pl @@ -0,0 +1,174 @@ +# +# Test using a standby server as the source. +# +# This sets up three nodes: A, B and C. First, A is the primary, +# B follows A, and C follows B: +# +# A (primary) <--- B (standby) <--- C (standby) +# +# +# Then we promote C, and insert some divergent rows in A and C: +# +# A (primary) <--- B (standby) C (primary) +# +# +# Finally, we run pg_rewind on C, to re-point it at B again: +# +# A (primary) <--- B (standby) <--- C (standby) +# +# +# The test is similar to the basic tests, but since we're dealing with +# three nodes, not two, we cannot use most of the RewindTest functions +# as is. 
diff --git a/src/bin/pg_rewind/t/007_standby_source.pl b/src/bin/pg_rewind/t/007_standby_source.pl
new file mode 100644
index 0000000000..64b7f1a3be
--- /dev/null
+++ b/src/bin/pg_rewind/t/007_standby_source.pl
@@ -0,0 +1,174 @@
+#
+# Test using a standby server as the source.
+#
+# This sets up three nodes: A, B and C. First, A is the primary,
+# B follows A, and C follows B:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# Then we promote C, and insert some divergent rows in A and C:
+#
+# A (primary) <--- B (standby)     C (primary)
+#
+#
+# Finally, we run pg_rewind on C, to re-point it at B again:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# The test is similar to the basic tests, but since we're dealing with
+# three nodes, not two, we cannot use most of the RewindTest functions
+# as is.
+
+use strict;
+use warnings;
+use TestLib;
+use Test::More tests => 3;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use File::Copy;
+use PostgresNode;
+use RewindTest;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_a;
+my $node_b;
+my $node_c;
+
+# Set up node A, as primary
+#
+# A (primary)
+
+setup_cluster('a');
+start_primary();
+$node_a = $node_primary;
+
+# Create a test table and insert a row in primary.
+$node_a->safe_psql('postgres', "CREATE TABLE tbl1 (d text)");
+$node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A')");
+primary_psql("CHECKPOINT");
+
+# Set up nodes B and C, as cascaded standbys
+#
+# A (primary) <--- B (standby) <--- C (standby)
+$node_a->backup('my_backup');
+$node_b = get_new_node('node_b');
+$node_b->init_from_backup($node_a, 'my_backup', has_streaming => 1);
+$node_b->set_standby_mode();
+$node_b->start;
+
+$node_b->backup('my_backup');
+$node_c = get_new_node('node_c');
+$node_c->init_from_backup($node_b, 'my_backup', has_streaming => 1);
+$node_c->set_standby_mode();
+$node_c->start;
+
+# Insert additional data on A, and wait for both standbys to catch up.
+$node_a->safe_psql('postgres',
+	"INSERT INTO tbl1 values ('in A, before promotion')");
+$node_a->safe_psql('postgres', 'CHECKPOINT');
+
+my $lsn = $node_a->lsn('insert');
+$node_a->wait_for_catchup('node_b', 'write', $lsn);
+$node_b->wait_for_catchup('node_c', 'write', $lsn);
+
+# Promote C
+#
+# A (primary) <--- B (standby)     C (primary)
+
+$node_c->promote;
+$node_c->safe_psql('postgres', "checkpoint");
+
+
+# Insert a row in A. This causes A/B and C to have "diverged", so that it's
+# no longer possible to just apply the standby's logs over the primary
+# directory - you need to rewind.
+$node_a->safe_psql('postgres',
+	"INSERT INTO tbl1 VALUES ('in A, after C was promoted')");
+
+# Also insert a new row in C, which won't be present in the
+# old primary.
+$node_c->safe_psql('postgres',
+	"INSERT INTO tbl1 VALUES ('in C, after C was promoted')");
+
+
+#
+# All set up. We're ready to run pg_rewind.
+#
+my $node_c_pgdata = $node_c->data_dir;
+
+# Stop the node and be ready to perform the rewind.
+$node_c->stop('fast');
+
+# Keep a temporary postgresql.conf, or it would be overwritten during the rewind.
+copy(
+	"$node_c_pgdata/postgresql.conf",
+	"$tmp_folder/node_c-postgresql.conf.tmp");
+
+{
+	# Temporarily unset PGAPPNAME so that the server doesn't
+	# inherit it. Otherwise this could affect libpqwalreceiver
+	# connections in confusing ways.
+	local %ENV = %ENV;
+	delete $ENV{PGAPPNAME};
+
+	# Do rewind using a remote connection as source, generating
+	# recovery configuration automatically.
+	command_ok(
+		[
+			'pg_rewind',                      "--debug",
+			"--source-server",                $node_b->connstr('postgres'),
+			"--target-pgdata=$node_c_pgdata", "--no-sync",
+			"--write-recovery-conf"
+		],
+		'pg_rewind remote');
+}
+
+# Now move back postgresql.conf with old settings
+move(
+	"$tmp_folder/node_c-postgresql.conf.tmp",
+	"$node_c_pgdata/postgresql.conf");
+
+# Restart the node.
+$node_c->start;
+
+# Set RewindTest::node_primary to point to the rewound node, so that we
+# can use check_query().
+$node_primary = $node_c;
+
+# Run some checks to verify that C has been successfully rewound,
+# and connected back to follow B.
+
+check_query(
+	'SELECT * FROM tbl1',
+	qq(in A
+in A, before promotion
+in A, after C was promoted
+),
+	'table content after rewind');
+
+# Insert another row, and observe that it's cascaded from A to B to C.
+$node_a->safe_psql('postgres', + "INSERT INTO tbl1 values ('in A, after rewind')"); + +$lsn = $node_a->lsn('insert'); +$node_b->wait_for_catchup('node_c', 'write', $lsn); + +check_query( + 'SELECT * FROM tbl1', + qq(in A +in A, before promotion +in A, after C was promoted +in A, after rewind +), + 'table content after rewind and insert'); + +# clean up +$node_a->teardown_node; +$node_b->teardown_node; +$node_c->teardown_node; + +exit(0);