Allow pg_rewind to use a standby server as the source system.
Using a hot standby server as the source has not been possible, because
pg_rewind creates a temporary table in the source system, to hold the
list of file ranges that need to be fetched. Refactor it to queue up the
file fetch requests in pg_rewind's memory, so that the temporary table
is no longer needed.

Also update the logic to compute 'minRecoveryPoint' correctly, when the
source is a standby server.

Reviewed-by: Kyotaro Horiguchi, Soumyadeep Chakraborty
Discussion: https://www.postgresql.org/message-id/0c5b3783-af52-3ee5-f8fa-6e794061f70d%40iki.fi
parent 1b2b19f758
commit 9c4f5192f6
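Editor's note: the heart of the refactoring is that, instead of COPYing the
list of file ranges into a temporary table on the source (impossible on a
read-only hot standby), pg_rewind now batches the queued ranges into three
parallel arrays and fetches all chunks with one query. A rough, hand-runnable
sketch of that query follows; the file names and ranges are illustrative:

-- unnest() turns the three arrays into (path, begin, len) rows;
-- pg_read_binary_file(..., true) returns NULL instead of erroring
-- if a file has been deleted in the meantime.
SELECT path, begin,
       pg_read_binary_file(path, begin, len, true) AS chunk
FROM unnest('{global/pg_control,PG_VERSION}'::text[],
            '{0,0}'::int8[],
            '{8192,4}'::int4[]) AS x(path, begin, len);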
doc/src/sgml/ref/pg_rewind.sgml
@@ -173,7 +173,7 @@ PostgreSQL documentation
         with a role having sufficient permissions to execute the functions
         used by <application>pg_rewind</application> on the source server
         (see Notes section for details) or a superuser role. This option
-        requires the source server to be running and not in recovery mode.
+        requires the source server to be running and accepting connections.
        </para>
      </listitem>
    </varlistentry>
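For reference, the "sufficient permissions" the paragraph refers to are the
GRANTs listed in the pg_rewind documentation's Notes section; with them, a
non-superuser role can act as the source connection user. The role name below
is illustrative:

-- Run on the source server; 'rewind_user' is a placeholder name.
GRANT EXECUTE ON function pg_catalog.pg_ls_dir(text, boolean, boolean) TO rewind_user;
GRANT EXECUTE ON function pg_catalog.pg_stat_file(text, boolean) TO rewind_user;
GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text) TO rewind_user;
GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, boolean) TO rewind_user;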
src/bin/pg_rewind/libpq_source.c
@@ -14,30 +14,51 @@
 #include "datapagemap.h"
 #include "file_ops.h"
 #include "filemap.h"
+#include "lib/stringinfo.h"
 #include "pg_rewind.h"
 #include "port/pg_bswap.h"
 #include "rewind_source.h"
 
 /*
- * Files are fetched max CHUNKSIZE bytes at a time.
- *
- * (This only applies to files that are copied in whole, or for truncated
- * files where we copy the tail. Relation files, where we know the individual
- * blocks that need to be fetched, are fetched in BLCKSZ chunks.)
+ * Files are fetched MAX_CHUNK_SIZE bytes at a time, and with a
+ * maximum of MAX_CHUNKS_PER_QUERY chunks in a single query.
  */
-#define CHUNKSIZE 1000000
+#define MAX_CHUNK_SIZE (1024 * 1024)
+#define MAX_CHUNKS_PER_QUERY 1000
+
+/* represents a request to fetch a piece of a file from the source */
+typedef struct
+{
+	const char *path;			/* path relative to data directory root */
+	off_t		offset;
+	size_t		length;
+} fetch_range_request;
 
 typedef struct
 {
 	rewind_source common;		/* common interface functions */
 
 	PGconn	   *conn;
-	bool		copy_started;
+
+	/*
+	 * Queue of chunks that have been requested with the queue_fetch_range()
+	 * function, but have not been fetched from the remote server yet.
+	 */
+	int			num_requests;
+	fetch_range_request request_queue[MAX_CHUNKS_PER_QUERY];
+
+	/* temporary space for process_queued_fetch_requests() */
+	StringInfoData paths;
+	StringInfoData offsets;
+	StringInfoData lengths;
 } libpq_source;
 
 static void init_libpq_conn(PGconn *conn);
 static char *run_simple_query(PGconn *conn, const char *sql);
 static void run_simple_command(PGconn *conn, const char *sql);
+static void appendArrayEscapedString(StringInfo buf, const char *str);
+
+static void process_queued_fetch_requests(libpq_source *src);
 
 /* public interface functions */
 static void libpq_traverse_files(rewind_source *source,
@@ -74,6 +95,10 @@ init_libpq_source(PGconn *conn)
 
 	src->conn = conn;
 
+	initStringInfo(&src->paths);
+	initStringInfo(&src->offsets);
+	initStringInfo(&src->lengths);
+
 	return &src->common;
 }
 
@@ -91,6 +116,12 @@ init_libpq_conn(PGconn *conn)
 	run_simple_command(conn, "SET lock_timeout = 0");
 	run_simple_command(conn, "SET idle_in_transaction_session_timeout = 0");
 
+	/*
+	 * we don't intend to do any updates, put the connection in read-only mode
+	 * to keep us honest
+	 */
+	run_simple_command(conn, "SET default_transaction_read_only = on");
+
 	/* secure search_path */
 	res = PQexec(conn, ALWAYS_SECURE_SEARCH_PATH_SQL);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
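The new default_transaction_read_only setting is defensive: if the tool (or
anything it calls) attempted a write on the source, the command would now fail
up front. An illustrative session, not part of the patch:

SET default_transaction_read_only = on;
CREATE TABLE should_fail (x int);
-- ERROR:  cannot execute CREATE TABLE in a read-only transaction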
@@ -98,17 +129,6 @@ init_libpq_conn(PGconn *conn)
 			 PQresultErrorMessage(res));
 	PQclear(res);
 
-	/*
-	 * Check that the server is not in hot standby mode. There is no
-	 * fundamental reason that couldn't be made to work, but it doesn't
-	 * currently because we use a temporary table. Better to check for it
-	 * explicitly than error out, for a better error message.
-	 */
-	str = run_simple_query(conn, "SELECT pg_is_in_recovery()");
-	if (strcmp(str, "f") != 0)
-		pg_fatal("source server must not be in recovery mode");
-	pg_free(str);
-
 	/*
 	 * Also check that full_page_writes is enabled. We can get torn pages if
 	 * a page is modified while we read it with pg_read_binary_file(), and we
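The guard removed here is the explicit check that made pg_rewind refuse a
standby source. The removed probe, runnable by hand:

-- Returns 'f' on a primary, 't' on a hot standby.
SELECT pg_is_in_recovery();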
@@ -118,6 +138,18 @@ init_libpq_conn(PGconn *conn)
 	if (strcmp(str, "on") != 0)
 		pg_fatal("full_page_writes must be enabled in the source server");
 	pg_free(str);
+
+	/* Prepare a statement we'll use to fetch files */
+	res = PQprepare(conn, "fetch_chunks_stmt",
+					"SELECT path, begin,\n"
+					" pg_read_binary_file(path, begin, len, true) AS chunk\n"
+					"FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
+					3, NULL);
+
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		pg_fatal("could not prepare statement to fetch file contents: %s",
+				 PQresultErrorMessage(res));
+	PQclear(res);
 }
 
 /*
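The prepared statement can be exercised from psql to see what the tool now
sends; the parameter values mirror the array literals that
process_queued_fetch_requests() builds below. Paths and lengths here are made
up, and pg_read_binary_file() requires appropriate privileges:

PREPARE fetch_chunks_stmt (text[], int8[], int4[]) AS
  SELECT path, begin,
         pg_read_binary_file(path, begin, len, true) AS chunk
  FROM unnest($1, $2, $3) AS x(path, begin, len);

-- Two queued requests: a 1 MB chunk of a relation file, and the
-- small PG_VERSION file.
EXECUTE fetch_chunks_stmt('{"base/1/1259","PG_VERSION"}',
                          '{0,0}',
                          '{1048576,4}');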
@@ -283,94 +315,125 @@ libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off,
 					   size_t len)
 {
 	libpq_source *src = (libpq_source *) source;
-	uint64		begin = off;
-	uint64		end = off + len;
 
 	/*
-	 * On first call, create a temporary table, and start COPYing to it.
-	 * We will load it with the list of blocks that we need to fetch.
+	 * Does this request happen to be a continuation of the previous chunk? If
+	 * so, merge it with the previous one.
+	 *
+	 * XXX: We use pointer equality to compare the path. That's good enough
+	 * for our purposes; the caller always passes the same pointer for the
+	 * same filename. If it didn't, we would fail to merge requests, but it
+	 * wouldn't affect correctness.
 	 */
-	if (!src->copy_started)
+	if (src->num_requests > 0)
 	{
-		PGresult   *res;
+		fetch_range_request *prev = &src->request_queue[src->num_requests - 1];
 
-		run_simple_command(src->conn, "CREATE TEMPORARY TABLE fetchchunks(path text, begin int8, len int4)");
-
-		res = PQexec(src->conn, "COPY fetchchunks FROM STDIN");
-		if (PQresultStatus(res) != PGRES_COPY_IN)
-			pg_fatal("could not send file list: %s",
-					 PQresultErrorMessage(res));
-		PQclear(res);
-
-		src->copy_started = true;
+		if (prev->offset + prev->length == off &&
+			prev->length < MAX_CHUNK_SIZE &&
+			prev->path == path)
+		{
+			/*
+			 * Extend the previous request to cover as much of this new
+			 * request as possible, without exceeding MAX_CHUNK_SIZE.
+			 */
+			size_t		thislen;
+
+			thislen = Min(len, MAX_CHUNK_SIZE - prev->length);
+			prev->length += thislen;
+
+			off += thislen;
+			len -= thislen;
+
+			/*
+			 * Fall through to create new requests for any remaining 'len'
+			 * that didn't fit in the previous chunk.
+			 */
+		}
 	}
 
-	/*
-	 * Write the file range to a temporary table in the server.
-	 *
-	 * The range is sent to the server as a COPY formatted line, to be inserted
-	 * into the 'fetchchunks' temporary table. The libpq_finish_fetch() uses
-	 * the temporary table to actually fetch the data.
-	 */
-
-	/* Split the range into CHUNKSIZE chunks */
-	while (end - begin > 0)
+	/* Divide the request into pieces of MAX_CHUNK_SIZE bytes each */
+	while (len > 0)
 	{
-		char		linebuf[MAXPGPATH + 23];
-		unsigned int len;
+		int32		thislen;
 
-		/* Fine as long as CHUNKSIZE is not bigger than UINT32_MAX */
-		if (end - begin > CHUNKSIZE)
-			len = CHUNKSIZE;
-		else
-			len = (unsigned int) (end - begin);
-
-		snprintf(linebuf, sizeof(linebuf), "%s\t" UINT64_FORMAT "\t%u\n", path, begin, len);
-
-		if (PQputCopyData(src->conn, linebuf, strlen(linebuf)) != 1)
-			pg_fatal("could not send COPY data: %s",
-					 PQerrorMessage(src->conn));
-
-		begin += len;
+		/* if the queue is full, perform all the work queued up so far */
+		if (src->num_requests == MAX_CHUNKS_PER_QUERY)
+			process_queued_fetch_requests(src);
+
+		thislen = Min(len, MAX_CHUNK_SIZE);
+		src->request_queue[src->num_requests].path = path;
+		src->request_queue[src->num_requests].offset = off;
+		src->request_queue[src->num_requests].length = thislen;
+		src->num_requests++;
+
+		off += thislen;
+		len -= thislen;
 	}
 }
 
 /*
- * Receive all the queued chunks and write them to the target data directory.
+ * Fetch all the queued chunks and write them to the target data directory.
  */
 static void
 libpq_finish_fetch(rewind_source *source)
 {
-	libpq_source *src = (libpq_source *) source;
+	process_queued_fetch_requests((libpq_source *) source);
+}
+
+static void
+process_queued_fetch_requests(libpq_source *src)
+{
+	const char *params[3];
 	PGresult   *res;
-	const char *sql;
+	int			chunkno;
 
-	if (PQputCopyEnd(src->conn, NULL) != 1)
-		pg_fatal("could not send end-of-COPY: %s",
-				 PQerrorMessage(src->conn));
+	if (src->num_requests == 0)
+		return;
 
-	while ((res = PQgetResult(src->conn)) != NULL)
-	{
-		if (PQresultStatus(res) != PGRES_COMMAND_OK)
-			pg_fatal("unexpected result while sending file list: %s",
-					 PQresultErrorMessage(res));
-		PQclear(res);
-	}
+	pg_log_debug("getting %d file chunks", src->num_requests);
 
 	/*
-	 * We've now copied the list of file ranges that we need to fetch to the
-	 * temporary table. Now, actually fetch all of those ranges.
+	 * The prepared statement, 'fetch_chunks_stmt', takes three arrays with
+	 * the same length as parameters: paths, offsets and lengths. Construct
+	 * the string representations of them.
 	 */
-	sql =
-		"SELECT path, begin,\n"
-		" pg_read_binary_file(path, begin, len, true) AS chunk\n"
-		"FROM fetchchunks\n";
+	resetStringInfo(&src->paths);
+	resetStringInfo(&src->offsets);
+	resetStringInfo(&src->lengths);
 
-	if (PQsendQueryParams(src->conn, sql, 0, NULL, NULL, NULL, NULL, 1) != 1)
+	appendStringInfoChar(&src->paths, '{');
+	appendStringInfoChar(&src->offsets, '{');
+	appendStringInfoChar(&src->lengths, '{');
+	for (int i = 0; i < src->num_requests; i++)
+	{
+		fetch_range_request *rq = &src->request_queue[i];
+
+		if (i > 0)
+		{
+			appendStringInfoChar(&src->paths, ',');
+			appendStringInfoChar(&src->offsets, ',');
+			appendStringInfoChar(&src->lengths, ',');
+		}
+
+		appendArrayEscapedString(&src->paths, rq->path);
+		appendStringInfo(&src->offsets, INT64_FORMAT, (int64) rq->offset);
+		appendStringInfo(&src->lengths, INT64_FORMAT, (int64) rq->length);
+	}
+	appendStringInfoChar(&src->paths, '}');
+	appendStringInfoChar(&src->offsets, '}');
+	appendStringInfoChar(&src->lengths, '}');
+
+	/*
+	 * Execute the prepared statement.
+	 */
+	params[0] = src->paths.data;
+	params[1] = src->offsets.data;
+	params[2] = src->lengths.data;
+
+	if (PQsendQueryPrepared(src->conn, "fetch_chunks_stmt", 3, params, NULL, NULL, 1) != 1)
 		pg_fatal("could not send query: %s", PQerrorMessage(src->conn));
 
-	pg_log_debug("getting file chunks");
-
 	if (PQsetSingleRowMode(src->conn) != 1)
 		pg_fatal("could not set libpq connection to single row mode");
 
@@ -382,8 +445,10 @@ libpq_finish_fetch(rewind_source *source)
 	 * chunk	bytea	-- file content
 	 *----
 	 */
+	chunkno = 0;
 	while ((res = PQgetResult(src->conn)) != NULL)
 	{
+		fetch_range_request *rq = &src->request_queue[chunkno];
 		char	   *filename;
 		int			filenamelen;
 		int64		chunkoff;
@@ -404,6 +469,9 @@ libpq_finish_fetch(rewind_source *source)
 					 PQresultErrorMessage(res));
 		}
 
+		if (chunkno > src->num_requests)
+			pg_fatal("received more data chunks than requested");
+
 		/* sanity check the result set */
 		if (PQnfields(res) != 3 || PQntuples(res) != 1)
 			pg_fatal("unexpected result set size while fetching remote files");
||||||
@ -448,31 +516,74 @@ libpq_finish_fetch(rewind_source *source)
|
|||||||
* If a file has been deleted on the source, remove it on the target
|
* If a file has been deleted on the source, remove it on the target
|
||||||
* as well. Note that multiple unlink() calls may happen on the same
|
* as well. Note that multiple unlink() calls may happen on the same
|
||||||
* file if multiple data chunks are associated with it, hence ignore
|
* file if multiple data chunks are associated with it, hence ignore
|
||||||
* unconditionally anything missing. If this file is not a relation
|
* unconditionally anything missing.
|
||||||
* data file, then it has been already truncated when creating the
|
|
||||||
* file chunk list at the previous execution of the filemap.
|
|
||||||
*/
|
*/
|
||||||
if (PQgetisnull(res, 0, 2))
|
if (PQgetisnull(res, 0, 2))
|
||||||
{
|
{
|
||||||
pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
|
pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
|
||||||
filename);
|
filename);
|
||||||
remove_target_file(filename, true);
|
remove_target_file(filename, true);
|
||||||
pg_free(filename);
|
|
||||||
PQclear(res);
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
|
||||||
|
filename, chunkoff, chunksize);
|
||||||
|
|
||||||
pg_log_debug("received chunk for file \"%s\", offset " INT64_FORMAT ", size %d",
|
if (strcmp(filename, rq->path) != 0)
|
||||||
filename, chunkoff, chunksize);
|
{
|
||||||
|
pg_fatal("received data for file \"%s\", when requested for \"%s\"",
|
||||||
|
filename, rq->path);
|
||||||
|
}
|
||||||
|
if (chunkoff != rq->offset)
|
||||||
|
pg_fatal("received data at offset " INT64_FORMAT " of file \"%s\", when requested for offset " INT64_FORMAT,
|
||||||
|
chunkoff, rq->path, (int64) rq->offset);
|
||||||
|
|
||||||
open_target_file(filename, false);
|
/*
|
||||||
|
* We should not receive receive more data than we requested, or
|
||||||
|
* pg_read_binary_file() messed up. We could receive less,
|
||||||
|
* though, if the file was truncated in the source after we
|
||||||
|
* checked its size. That's OK, there should be a WAL record of
|
||||||
|
* the truncation, which will get replayed when you start the
|
||||||
|
* target system for the first time after pg_rewind has completed.
|
||||||
|
*/
|
||||||
|
if (chunksize > rq->length)
|
||||||
|
pg_fatal("received more than requested for file \"%s\"", rq->path);
|
||||||
|
|
||||||
write_target_range(chunk, chunkoff, chunksize);
|
open_target_file(filename, false);
|
||||||
|
|
||||||
|
write_target_range(chunk, chunkoff, chunksize);
|
||||||
|
}
|
||||||
|
|
||||||
pg_free(filename);
|
pg_free(filename);
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
chunkno++;
|
||||||
}
|
}
|
||||||
|
if (chunkno != src->num_requests)
|
||||||
|
pg_fatal("unexpected number of data chunks received");
|
||||||
|
|
||||||
|
src->num_requests = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Escape a string to be used as element in a text array constant
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
appendArrayEscapedString(StringInfo buf, const char *str)
|
||||||
|
{
|
||||||
|
appendStringInfoCharMacro(buf, '\"');
|
||||||
|
while (*str)
|
||||||
|
{
|
||||||
|
char ch = *str;
|
||||||
|
|
||||||
|
if (ch == '"' || ch == '\\')
|
||||||
|
appendStringInfoCharMacro(buf, '\\');
|
||||||
|
|
||||||
|
appendStringInfoCharMacro(buf, ch);
|
||||||
|
|
||||||
|
str++;
|
||||||
|
}
|
||||||
|
appendStringInfoCharMacro(buf, '\"');
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
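appendArrayEscapedString() implements the quoting rules for one element of a
text[] literal: wrap it in double quotes and backslash-escape embedded double
quotes and backslashes. The effect is easy to verify from SQL; the element
values below echo the file names the updated extra-files test creates
(assuming standard_conforming_strings = on, so the backslashes in the
single-quoted literal pass through to the array parser):

SELECT unnest('{"standby_file3 with ''quotes''",
                "standby_file4 with double\"quote",
                "standby_file5 with back\\slash"}'::text[]);
--  standby_file3 with 'quotes'
--  standby_file4 with double"quote
--  standby_file5 with back\slash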
@@ -521,6 +632,12 @@ libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
 static void
 libpq_destroy(rewind_source *source)
 {
-	pfree(source);
+	libpq_source *src = (libpq_source *) source;
+
+	pfree(src->paths.data);
+	pfree(src->offsets.data);
+	pfree(src->lengths.data);
+	pfree(src);
+
 	/* NOTE: we don't close the connection here, as it was not opened by us. */
 }
src/bin/pg_rewind/pg_rewind.c
@@ -50,6 +50,7 @@ static void disconnect_atexit(void);
 
 static ControlFileData ControlFile_target;
 static ControlFileData ControlFile_source;
+static ControlFileData ControlFile_source_after;
 
 const char *progname;
 int			WalSegSz;
@@ -486,6 +487,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 	XLogRecPtr	endrec;
 	TimeLineID	endtli;
 	ControlFileData ControlFile_new;
+	size_t		size;
+	char	   *buffer;
 
 	/*
 	 * Execute the actions in the file map, fetching data from the source
@@ -552,40 +555,104 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 		}
 	}
 
-	/*
-	 * We've now copied the list of file ranges that we need to fetch to the
-	 * temporary table. Now, actually fetch all of those ranges.
-	 */
+	/* Complete any remaining range-fetches that we queued up above. */
 	source->finish_fetch(source);
 
 	close_target_file();
 
 	progress_report(true);
 
+	/*
+	 * Fetch the control file from the source last. This ensures that the
+	 * minRecoveryPoint is up-to-date.
+	 */
+	buffer = source->fetch_file(source, "global/pg_control", &size);
+	digestControlFile(&ControlFile_source_after, buffer, size);
+	pg_free(buffer);
+
+	/*
+	 * Sanity check: If the source is a local system, the control file should
+	 * not have changed since we started.
+	 *
+	 * XXX: We assume it hasn't been modified, but actually, what could go
+	 * wrong? The logic handles a libpq source that's modified concurrently,
+	 * why not a local datadir?
+	 */
+	if (datadir_source &&
+		memcmp(&ControlFile_source, &ControlFile_source_after,
+			   sizeof(ControlFileData)) != 0)
+	{
+		pg_fatal("source system was modified while pg_rewind was running");
+	}
+
 	if (showprogress)
 		pg_log_info("creating backup label and updating control file");
+
+	/*
+	 * Create a backup label file, to tell the target where to begin the WAL
+	 * replay. Normally, from the last common checkpoint between the source
+	 * and the target. But if the source is a standby server, it's possible
+	 * that the last common checkpoint is *after* the standby's restartpoint.
+	 * That implies that the source server has applied the checkpoint record,
+	 * but hasn't performed a corresponding restartpoint yet. Make sure we
+	 * start at the restartpoint's redo point in that case.
+	 *
+	 * Use the old version of the source's control file for this. The server
+	 * might have finished the restartpoint after we started copying files,
+	 * but we must begin from the redo point at the time that we started
+	 * copying.
+	 */
+	if (ControlFile_source.checkPointCopy.redo < chkptredo)
+	{
+		chkptredo = ControlFile_source.checkPointCopy.redo;
+		chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		chkptrec = ControlFile_source.checkPoint;
+	}
 	createBackupLabel(chkptredo, chkpttli, chkptrec);
 
 	/*
-	 * Update control file of target. Make it ready to perform archive
-	 * recovery when restarting.
-	 *
-	 * Like in an online backup, it's important that we replay all the WAL
-	 * that was generated while we copied the files over. To enforce that, set
-	 * 'minRecoveryPoint' in the control file.
+	 * Update control file of target, to tell the target how far it must
+	 * replay the WAL (minRecoveryPoint).
 	 */
-	memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));
-
 	if (connstr_source)
 	{
-		endrec = source->get_current_wal_insert_lsn(source);
-		endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		/*
+		 * The source is a live server. Like in an online backup, it's
+		 * important that we recover all the WAL that was generated while we
+		 * were copying files.
+		 */
+		if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
+		{
+			/*
+			 * Source is a standby server. We must replay to its
+			 * minRecoveryPoint.
+			 */
+			endrec = ControlFile_source_after.minRecoveryPoint;
+			endtli = ControlFile_source_after.minRecoveryPointTLI;
+		}
+		else
+		{
+			/*
+			 * Source is a production, non-standby, server. We must replay to
+			 * the last WAL insert location.
+			 */
+			if (ControlFile_source_after.state != DB_IN_PRODUCTION)
+				pg_fatal("source system was in unexpected state at end of rewind");
+
+			endrec = source->get_current_wal_insert_lsn(source);
+			endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+		}
 	}
 	else
 	{
-		endrec = ControlFile_source.checkPoint;
-		endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		/*
+		 * Source is a local data directory. It should've shut down cleanly,
+		 * and we must replay to the latest shutdown checkpoint.
+		 */
+		endrec = ControlFile_source_after.checkPoint;
+		endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+	}
+
+	memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
 	ControlFile_new.minRecoveryPoint = endrec;
 	ControlFile_new.minRecoveryPointTLI = endtli;
 	ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
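The three branches above can be cross-checked against what the source itself
reports. On a standby source, the values now taken from the freshly fetched
control file are also visible via pg_control_recovery(); on a primary, the end
point is the current WAL insert position instead. A sketch, not part of the
patch:

-- Standby source: the new minRecoveryPoint / TLI for the target.
SELECT min_recovery_end_lsn, min_recovery_end_timeline
FROM pg_control_recovery();

-- Primary source: replay must instead reach the WAL insert position
-- at the end of the copy.
SELECT pg_current_wal_insert_lsn();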
src/bin/pg_rewind/t/003_extrafiles.pl
@@ -40,10 +40,22 @@ sub run_test
 	  "in standby1";
 	append_to_file "$test_standby_datadir/tst_standby_dir/standby_file2",
 	  "in standby2";
+	append_to_file
+	  "$test_standby_datadir/tst_standby_dir/standby_file3 with 'quotes'",
+	  "in standby3";
+	append_to_file
+	  "$test_standby_datadir/tst_standby_dir/standby_file4 with double\"quote",
+	  "in standby4";
+	append_to_file
+	  "$test_standby_datadir/tst_standby_dir/standby_file5 with back\\slash",
+	  "in standby5";
+	append_to_file
+	  "$test_standby_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote",
+	  "in standby6";
 	mkdir "$test_standby_datadir/tst_standby_dir/standby_subdir/";
 	append_to_file
-	  "$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file3",
-	  "in standby3";
+	  "$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file7",
+	  "in standby7";
 
 	mkdir "$test_primary_datadir/tst_primary_dir";
 	append_to_file "$test_primary_datadir/tst_primary_dir/primary_file1",
@@ -58,7 +70,9 @@ sub run_test
 	RewindTest::promote_standby();
 	RewindTest::run_pg_rewind($test_mode);
 
-	# List files in the data directory after rewind.
+	# List files in the data directory after rewind. All the files that
+	# were present in the standby should be present after rewind, and
+	# all the files that were added on the primary should be removed.
 	my @paths;
 	find(
 		sub {
@@ -78,8 +92,12 @@ sub run_test
 			"$test_primary_datadir/tst_standby_dir",
 			"$test_primary_datadir/tst_standby_dir/standby_file1",
 			"$test_primary_datadir/tst_standby_dir/standby_file2",
+			"$test_primary_datadir/tst_standby_dir/standby_file3 with 'quotes'",
+			"$test_primary_datadir/tst_standby_dir/standby_file4 with double\"quote",
+			"$test_primary_datadir/tst_standby_dir/standby_file5 with back\\slash",
+			"$test_primary_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote",
 			"$test_primary_datadir/tst_standby_dir/standby_subdir",
-			"$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file3"
+			"$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file7"
 		],
 		"file lists match");
 
src/bin/pg_rewind/t/007_standby_source.pl (new file, 174 lines)
@@ -0,0 +1,174 @@
+#
+# Test using a standby server as the source.
+#
+# This sets up three nodes: A, B and C. First, A is the primary,
+# B follows A, and C follows B:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# Then we promote C, and insert some divergent rows in A and C:
+#
+# A (primary) <--- B (standby)      C (primary)
+#
+#
+# Finally, we run pg_rewind on C, to re-point it at B again:
+#
+# A (primary) <--- B (standby) <--- C (standby)
+#
+#
+# The test is similar to the basic tests, but since we're dealing with
+# three nodes, not two, we cannot use most of the RewindTest functions
+# as is.
+
+use strict;
+use warnings;
+use TestLib;
+use Test::More tests => 3;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use File::Copy;
+use PostgresNode;
+use RewindTest;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_a;
+my $node_b;
+my $node_c;
+
+# Set up node A, as primary
+#
+# A (primary)
+
+setup_cluster('a');
+start_primary();
+$node_a = $node_primary;
+
+# Create a test table and insert a row in primary.
+$node_a->safe_psql('postgres', "CREATE TABLE tbl1 (d text)");
+$node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A')");
+primary_psql("CHECKPOINT");
+
+# Set up node B and C, as cascaded standbys
+#
+# A (primary) <--- B (standby) <--- C (standby)
+$node_a->backup('my_backup');
+$node_b = get_new_node('node_b');
+$node_b->init_from_backup($node_a, 'my_backup', has_streaming => 1);
+$node_b->set_standby_mode();
+$node_b->start;
+
+$node_b->backup('my_backup');
+$node_c = get_new_node('node_c');
+$node_c->init_from_backup($node_b, 'my_backup', has_streaming => 1);
+$node_c->set_standby_mode();
+$node_c->start;
+
+# Insert additional data on A, and wait for both standbys to catch up.
+$node_a->safe_psql('postgres',
+	"INSERT INTO tbl1 values ('in A, before promotion')");
+$node_a->safe_psql('postgres', 'CHECKPOINT');
+
+my $lsn = $node_a->lsn('insert');
+$node_a->wait_for_catchup('node_b', 'write', $lsn);
+$node_b->wait_for_catchup('node_c', 'write', $lsn);
+
+# Promote C
+#
+# A (primary) <--- B (standby)      C (primary)
+
+$node_c->promote;
+$node_c->safe_psql('postgres', "checkpoint");
+
+
+# Insert a row in A. This causes A/B and C to have "diverged", so that it's
+# no longer possible to just apply the standby's logs over the primary
+# directory - you need to rewind.
+$node_a->safe_psql('postgres',
+	"INSERT INTO tbl1 VALUES ('in A, after C was promoted')");
+
+# Also insert a new row in the standby, which won't be present in the
+# old primary.
+$node_c->safe_psql('postgres',
+	"INSERT INTO tbl1 VALUES ('in C, after C was promoted')");
+
+
+#
+# All set up. We're ready to run pg_rewind.
+#
+my $node_c_pgdata = $node_c->data_dir;
+
+# Stop the node and be ready to perform the rewind.
+$node_c->stop('fast');
+
+# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
+copy(
+	"$node_c_pgdata/postgresql.conf",
+	"$tmp_folder/node_c-postgresql.conf.tmp");
+
+{
+	# Temporarily unset PGAPPNAME so that the server doesn't
+	# inherit it. Otherwise this could affect libpqwalreceiver
+	# connections in confusing ways.
+	local %ENV = %ENV;
+	delete $ENV{PGAPPNAME};
+
+	# Do rewind using a remote connection as source, generating
+	# recovery configuration automatically.
+	command_ok(
+		[
+			'pg_rewind', "--debug",
+			"--source-server", $node_b->connstr('postgres'),
+			"--target-pgdata=$node_c_pgdata", "--no-sync",
+			"--write-recovery-conf"
+		],
+		'pg_rewind remote');
+}
+
+# Now move back postgresql.conf with old settings
+move(
+	"$tmp_folder/node_c-postgresql.conf.tmp",
+	"$node_c_pgdata/postgresql.conf");
+
+# Restart the node.
+$node_c->start;
+
+# set RewindTest::node_primary to point to the rewound node, so that we
+# can use check_query()
+$node_primary = $node_c;
+
+# Run some checks to verify that C has been successfully rewound,
+# and connected back to follow B.
+
+check_query(
+	'SELECT * FROM tbl1',
+	qq(in A
+in A, before promotion
+in A, after C was promoted
+),
+	'table content after rewind');
+
+# Insert another row, and observe that it's cascaded from A to B to C.
+$node_a->safe_psql('postgres',
+	"INSERT INTO tbl1 values ('in A, after rewind')");
+
+$lsn = $node_a->lsn('insert');
+$node_b->wait_for_catchup('node_c', 'write', $lsn);
+
+check_query(
+	'SELECT * FROM tbl1',
+	qq(in A
+in A, before promotion
+in A, after C was promoted
+in A, after rewind
+),
+	'table content after rewind and insert');
+
+# clean up
+$node_a->teardown_node;
+$node_b->teardown_node;
+$node_c->teardown_node;
+
+exit(0);