postgresql/contrib/pg_standby/pg_standby.c
Heikki Linnakangas 8fd733bd19 Disable pg_standby -l option because the backend doesn't expect the recovered
file to be a symlink. We tried to fix this issue with an earlier server-side
patch, but it didn't fix the whole issue.

The same bug is present in older releases as well, but the 8.4 train is
about to leave the station, and I'm not sure if we have consensus on whether
we can remove the -l option in back-branches or whether we need to attempt a
server-side fix to make symlinking safe.

Patch by Simon Riggs, per discussion on bug identified by Fujii Masao.
2009-06-25 12:03:11 +00:00

831 lines
22 KiB
C

/*
* $PostgreSQL: pgsql/contrib/pg_standby/pg_standby.c,v 1.25 2009/06/25 12:03:10 heikki Exp $
*
*
* pg_standby.c
*
* Production-ready example of how to create a Warm Standby
* database server using continuous archiving as a
* replication mechanism
*
* We separate the parameters for archive and nextWALfile
* so that we can check the archive exists, even if the
* WAL file doesn't (yet).
*
* This program will be executed once in full for each file
* requested by the warm standby server.
*
* It is designed to cater to a variety of needs, as well
* as providing a customizable section.
*
* Original author: Simon Riggs simon@2ndquadrant.com
* Current maintainer: Simon Riggs
*/
#include "postgres_fe.h"
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#ifdef WIN32
int getopt(int argc, char *const argv[], const char *optstring);
#else
#include <sys/time.h>
#include <unistd.h>
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#endif /* ! WIN32 */
extern char *optarg;
extern int optind;
const char *progname;
/*
 * Options and defaults
 *
 * The option values are set from the command line in main(); the remaining
 * variables carry state shared between the functions in this file.
 */
int sleeptime = 5; /* seconds to sleep between file checks (-s) */
int waittime = -1; /* seconds waited so far in main()'s wait loop,
 * -1 means no wait yet */
int maxwaittime = 0; /* max seconds to wait for a file (-w), 0 means
 * no limit */
int keepfiles = 0; /* number of WAL files to keep (-k), 0 keeps all */
int maxretries = 3; /* number of retries on restore command (-r) */
bool debug = false; /* are we debugging? (-d) */
bool need_cleanup = false; /* do we need to remove files from
 * archive? */
/* set by sighandler(), polled in main()'s wait loop */
static volatile sig_atomic_t signaled = false;
char *archiveLocation; /* where to find the archive? */
char *triggerPath; /* where to find the trigger file? NULL if -t
 * was not given */
char *xlogFilePath; /* where we are going to restore to */
char *nextWALFileName; /* the file we need to get from archive */
char *restartWALFileName; /* the file from which we can restart restore;
 * NULL if not given on the command line */
char *priorWALFileName; /* NOTE(review): not referenced by any code
 * visible in this file -- confirm it is unused */
char WALFilePath[MAXPGPATH]; /* the file path including archive */
char restoreCommand[MAXPGPATH]; /* run this to restore */
char exclusiveCleanupFileName[MAXPGPATH]; /* cleanup boundary: archived
 * WAL files sorting before
 * this name are removed, this
 * file itself is kept */
/*
* Two types of failover are supported (smart and fast failover).
*
* The content of the trigger file determines the type of failover. If the
* trigger file contains the word "smart" (or the file is empty), smart
* failover is chosen: pg_standby acts as cp or ln command itself, on
* successful completion all the available WAL records will be applied
* resulting in zero data loss. But, it might take a long time to finish
* recovery if there's a lot of unapplied WAL.
*
* On the other hand, if the trigger file contains the word "fast", the
* recovery is finished immediately even if unapplied WAL files remain. Any
* transactions in the unapplied WAL files are lost.
*
* An empty trigger file performs smart failover. SIGUSR1 or SIGINT trigger
* fast failover. A timeout causes fast failover (smart failover would have
* the same effect, since if the timeout is reached there is no unapplied WAL).
*/
/* Failover states; see the description of smart and fast failover above */
#define NoFailover 0
#define SmartFailover 1
#define FastFailover 2
/* Current failover state, set by CheckForExternalTrigger() and main() */
static int Failover = NoFailover;
/* How the WAL file is brought into place; see CustomizableInitialize() */
#define RESTORE_COMMAND_COPY 0
#define RESTORE_COMMAND_LINK 1
/* One of RESTORE_COMMAND_*; zero-initialized, i.e. copy unless changed */
int restoreCommandType;
/* Kind of file being requested, as classified from nextWALFileName */
#define XLOG_DATA 0
#define XLOG_HISTORY 1
#define XLOG_BACKUP_LABEL 2
/* One of XLOG_*; set by CustomizableNextWALFileReady() and main() */
int nextWALFileType;
/*
 * Build restoreCommand as: cmd "arg1" "arg2"
 *
 * cmd must be a string literal, because it is pasted onto the format string.
 * The arguments are wrapped in double quotes but not otherwise escaped, and
 * the result is limited to MAXPGPATH bytes by snprintf.
 */
#define SET_RESTORE_COMMAND(cmd, arg1, arg2) \
snprintf(restoreCommand, MAXPGPATH, cmd " \"%s\" \"%s\"", arg1, arg2)
/* Scratch buffer shared by all stat() calls in this file */
struct stat stat_buf;
/* =====================================================================
*
* Customizable section
*
* =====================================================================
*
* Currently, this section assumes that the Archive is a locally
* accessible directory. If you want to make other assumptions,
* such as using a vendor-specific archive and access API, these
* routines are the ones you'll need to change. You're
* encouraged to submit any changes to pgsql-hackers@postgresql.org
* or personally to the current maintainer. Those changes may be
* folded in to later versions of this program.
*/
/* Length of a WAL segment file name: three fields of 8 hex digits each */
#define XLOG_DATA_FNAME_LEN 24
/*
 * Reworked from access/xlog_internal.h
 *
 * Format a WAL segment file name into fname from the three values tli, log
 * and seg. fname must have room for XLOG_DATA_FNAME_LEN + 1 bytes.
 */
#define XLogFileName(fname, tli, log, seg) \
snprintf(fname, XLOG_DATA_FNAME_LEN + 1, "%08X%08X%08X", tli, log, seg)
/*
 * CustomizableInitialize()
 *
 * Prepare the two strings the rest of the program works with:
 *
 *	WALFilePath		archiveLocation joined with nextWALFileName
 *	restoreCommand	the shell command that brings WALFilePath to
 *					xlogFilePath
 *
 * As an example, and probably the common case, the command is cp or ln on
 * *nix and copy or mklink on Windows, chosen by restoreCommandType. On *nix
 * the link variant is only compiled in when HAVE_WORKING_LINK is set;
 * otherwise a link request falls back to cp.
 *
 * Exits with status 2 if archiveLocation cannot be stat'ed.
 */
static void
CustomizableInitialize(void)
{
#ifdef WIN32
	snprintf(WALFilePath, MAXPGPATH, "%s\\%s", archiveLocation, nextWALFileName);

	if (restoreCommandType == RESTORE_COMMAND_LINK)
		SET_RESTORE_COMMAND("mklink", WALFilePath, xlogFilePath);
	else
		SET_RESTORE_COMMAND("copy", WALFilePath, xlogFilePath);
#else
	snprintf(WALFilePath, MAXPGPATH, "%s/%s", archiveLocation, nextWALFileName);

#if HAVE_WORKING_LINK
	if (restoreCommandType == RESTORE_COMMAND_LINK)
		SET_RESTORE_COMMAND("ln -s -f", WALFilePath, xlogFilePath);
	else
#endif
		SET_RESTORE_COMMAND("cp", WALFilePath, xlogFilePath);
#endif

	/*
	 * This code assumes that archiveLocation is a directory. You may wish to
	 * add code to check for tape libraries, etc. Since it is a directory, a
	 * plain stat() is enough to test whether it is accessible.
	 */
	if (stat(archiveLocation, &stat_buf) == 0)
		return;

	fprintf(stderr, "%s: archiveLocation \"%s\" does not exist\n", progname, archiveLocation);
	fflush(stderr);
	exit(2);
}
/*
 * CustomizableNextWALFileReady()
 *
 * Is the requested file ready yet?
 *
 * Returns true, and sets nextWALFileType, when WALFilePath exists and is
 * either a backup label file (24 hex digits followed by something ending in
 * ".backup") or a file of exactly XLOG_SEG_SIZE bytes. Returns false when
 * the file is missing or still smaller than a full segment. A file larger
 * than XLOG_SEG_SIZE is treated as fatal: the program exits with status 3.
 */
static bool
CustomizableNextWALFileReady()
{
	size_t		namelen;

	/* Nothing there yet */
	if (stat(WALFilePath, &stat_buf) != 0)
		return false;

	/* A backup file is ready as soon as it exists, whatever its size */
	namelen = strlen(nextWALFileName);
	if (namelen > 24 &&
		strspn(nextWALFileName, "0123456789ABCDEF") == 24 &&
		strcmp(nextWALFileName + namelen - strlen(".backup"), ".backup") == 0)
	{
		nextWALFileType = XLOG_BACKUP_LABEL;
		return true;
	}

	/* A regular WAL file is ready only once it has reached its full size */
	if (stat_buf.st_size == XLOG_SEG_SIZE)
	{
#ifdef WIN32

		/*
		 * Windows 'cp' sets the final file size before the copy is complete,
		 * and not yet ready to be opened by pg_standby. So we wait for
		 * sleeptime secs before attempting to restore. If that is not
		 * enough, we will rely on the retry/holdoff mechanism. GNUWin32's cp
		 * does not have this problem.
		 */
		pg_usleep(sleeptime * 1000000L);
#endif
		nextWALFileType = XLOG_DATA;
		return true;
	}

	/* Bigger than a segment can ever be: give up */
	if (stat_buf.st_size > XLOG_SEG_SIZE)
	{
		if (debug)
		{
			fprintf(stderr, "file size greater than expected\n");
			fflush(stderr);
		}
		exit(3);
	}

	/* Still too small, the caller waits until it is the correct size */
	return false;
}
/*
 * Number of XLOG_SEG_SIZE-sized segments counted per log file id.
 * SetWALFileNameForCleanup() uses it to split a count of WAL files into a
 * log part and a seg part.
 */
#define MaxSegmentsPerLogFile ( 0xFFFFFFFF / XLOG_SEG_SIZE )
static void
CustomizableCleanupPriorWALFiles(void)
{
/*
* Work out name of prior file from current filename
*/
if (nextWALFileType == XLOG_DATA)
{
int rc;
DIR *xldir;
struct dirent *xlde;
/*
* Assume its OK to keep failing. The failure situation may change
* over time, so we'd rather keep going on the main processing than
* fail because we couldnt clean up yet.
*/
if ((xldir = opendir(archiveLocation)) != NULL)
{
while ((xlde = readdir(xldir)) != NULL)
{
/*
* We ignore the timeline part of the XLOG segment identifiers
* in deciding whether a segment is still needed. This
* ensures that we won't prematurely remove a segment from a
* parent timeline. We could probably be a little more
* proactive about removing segments of non-parent timelines,
* but that would be a whole lot more complicated.
*
* We use the alphanumeric sorting property of the filenames
* to decide which ones are earlier than the
* exclusiveCleanupFileName file. Note that this means files
* are not removed in the order they were originally written,
* in case this worries you.
*/
if (strlen(xlde->d_name) == XLOG_DATA_FNAME_LEN &&
strspn(xlde->d_name, "0123456789ABCDEF") == XLOG_DATA_FNAME_LEN &&
strcmp(xlde->d_name + 8, exclusiveCleanupFileName + 8) < 0)
{
#ifdef WIN32
snprintf(WALFilePath, MAXPGPATH, "%s\\%s", archiveLocation, xlde->d_name);
#else
snprintf(WALFilePath, MAXPGPATH, "%s/%s", archiveLocation, xlde->d_name);
#endif
if (debug)
fprintf(stderr, "\nremoving \"%s\"", WALFilePath);
rc = unlink(WALFilePath);
if (rc != 0)
{
fprintf(stderr, "\n%s: ERROR failed to remove \"%s\": %s",
progname, WALFilePath, strerror(errno));
break;
}
}
}
if (debug)
fprintf(stderr, "\n");
}
else
fprintf(stderr, "%s: archiveLocation \"%s\" open error\n", progname, archiveLocation);
closedir(xldir);
fflush(stderr);
}
}
/* =====================================================================
* End of Customizable section
* =====================================================================
*/
/*
 * SetWALFileNameForCleanup()
 *
 * Set exclusiveCleanupFileName, the earliest WAL filename that we want to
 * keep on the archive, and decide whether we need_cleanup.
 *
 * If restartWALFileName was given it is used as the boundary directly.
 * Otherwise, if keepfiles > 0, the boundary is nextWALFileName moved back by
 * keepfiles segments. With neither, no cleanup is requested.
 *
 * Returns true if archive cleanup should be performed, false otherwise.
 */
static bool
SetWALFileNameForCleanup(void)
{
	uint32		tli = 1,
				log = 0,
				seg = 0;
	uint32		log_diff = 0,
				seg_diff = 0;
	bool		cleanup = false;

	if (restartWALFileName)
	{
		/*
		 * Don't do cleanup if the restartWALFileName provided is later than
		 * the xlog file requested. This is an error and we must not remove
		 * these files from archive. This shouldn't happen, but better safe
		 * than sorry.
		 */
		if (strcmp(restartWALFileName, nextWALFileName) > 0)
			return false;

		/*
		 * restartWALFileName comes straight from the command line, so bound
		 * the copy to the size of the destination buffer; an over-long name
		 * is truncated rather than overrunning it.
		 */
		snprintf(exclusiveCleanupFileName, sizeof(exclusiveCleanupFileName),
				 "%s", restartWALFileName);
		return true;
	}

	if (keepfiles > 0)
	{
		/*
		 * If the name does not parse, tli/log/seg keep their initial values
		 * and the seg > 0 test below turns cleanup off.
		 */
		sscanf(nextWALFileName, "%08X%08X%08X", &tli, &log, &seg);
		if (tli > 0 && seg > 0)
		{
			log_diff = keepfiles / MaxSegmentsPerLogFile;
			seg_diff = keepfiles % MaxSegmentsPerLogFile;

			/* Borrow from the log part if seg alone cannot go back enough */
			if (seg_diff > seg)
			{
				log_diff++;
				seg = MaxSegmentsPerLogFile - (seg_diff - seg);
			}
			else
				seg -= seg_diff;

			if (log >= log_diff)
			{
				log -= log_diff;
				cleanup = true;
			}
			else
			{
				/* Fewer files exist than we were asked to keep */
				log = 0;
				seg = 0;
			}
		}
	}

	XLogFileName(exclusiveCleanupFileName, tli, log, seg);

	return cleanup;
}
/*
 * CheckForExternalTrigger()
 *
 * Is there a trigger file? Sets global 'Failover' variable to indicate
 * what kind of a trigger file it was:
 *
 *	- an empty file, or content starting with "smart": SmartFailover
 *	- content starting with "fast": FastFailover; the file is then
 *	  truncated, which turns it into a "smart" trigger as a side-effect
 *
 * Any other content only produces a warning. 'Failover' is left unchanged in
 * that case, and also when no trigger file is configured, when it does not
 * exist, or when it cannot be opened or read.
 */
static void
CheckForExternalTrigger(void)
{
	char		buf[32];
	int			fd;
	int			len;

	/*
	 * Look for a trigger file, if that option has been selected
	 *
	 * We use stat() here because triggerPath is always a file rather than
	 * potentially being in an archive
	 */
	if (!triggerPath || stat(triggerPath, &stat_buf) != 0)
		return;

	/*
	 * An empty trigger file performs smart failover. There's a little race
	 * condition here: if the writer of the trigger file has just created the
	 * file, but not yet written anything to it, we'll treat that as smart
	 * shutdown even if the other process was just about to write "fast" to
	 * it. But that's fine: we'll restore one more WAL file, and when we're
	 * invoked next time, we'll see the word "fast" and fail over immediately.
	 */
	if (stat_buf.st_size == 0)
	{
		Failover = SmartFailover;
		fprintf(stderr, "trigger file found: smart failover\n");
		fflush(stderr);
		return;
	}

	/* Opened read-write because a "fast" trigger is truncated below */
	if ((fd = open(triggerPath, O_RDWR, 0)) < 0)
	{
		fprintf(stderr, "WARNING: could not open \"%s\": %s\n",
				triggerPath, strerror(errno));
		fflush(stderr);
		return;
	}

	/*
	 * Read at most sizeof(buf) - 1 bytes so that there is always room for
	 * the terminating NUL, even if the file is larger than the buffer.
	 */
	if ((len = read(fd, buf, sizeof(buf) - 1)) < 0)
	{
		fprintf(stderr, "WARNING: could not read \"%s\": %s\n",
				triggerPath, strerror(errno));
		fflush(stderr);
		close(fd);
		return;
	}
	buf[len] = '\0';

	if (strncmp(buf, "smart", 5) == 0)
	{
		Failover = SmartFailover;
		fprintf(stderr, "trigger file found: smart failover\n");
		fflush(stderr);
		close(fd);
		return;
	}

	if (strncmp(buf, "fast", 4) == 0)
	{
		Failover = FastFailover;
		fprintf(stderr, "trigger file found: fast failover\n");
		fflush(stderr);

		/*
		 * Turn it into a "smart" trigger by truncating the file. Otherwise if
		 * the server asks us again to restore a segment that was restored
		 * already, we would return "not found" and upset the server.
		 */
		if (ftruncate(fd, 0) < 0)
		{
			fprintf(stderr, "WARNING: could not truncate \"%s\": %s\n",
					triggerPath, strerror(errno));
			fflush(stderr);
		}
		close(fd);
		return;
	}

	close(fd);
	fprintf(stderr, "WARNING: invalid content in \"%s\"\n", triggerPath);
	fflush(stderr);
}
/*
* RestoreWALFileForRecovery()
*
* Perform the action required to restore the file from archive
*/
static bool
RestoreWALFileForRecovery(void)
{
int rc = 0;
int numretries = 0;
if (debug)
{
fprintf(stderr, "running restore :");
fflush(stderr);
}
while (numretries <= maxretries)
{
rc = system(restoreCommand);
if (rc == 0)
{
if (debug)
{
fprintf(stderr, " OK\n");
fflush(stderr);
}
return true;
}
pg_usleep(numretries++ * sleeptime * 1000000L);
}
/*
* Allow caller to add additional info
*/
if (debug)
fprintf(stderr, "not restored\n");
return false;
}
/*
 * usage()
 *
 * Print the command-line help text to stdout.
 *
 * The example restore_command deliberately does not use -l: that option is
 * accepted for compatibility but does nothing.
 */
static void
usage(void)
{
	printf("%s allows PostgreSQL warm standby servers to be configured.\n\n", progname);
	printf("Usage:\n");
	printf(" %s [OPTION]... ARCHIVELOCATION NEXTWALFILE XLOGFILEPATH [RESTARTWALFILE]\n", progname);
	printf("\n"
		   "with main intended use as a restore_command in the recovery.conf:\n"
		   " restore_command = 'pg_standby [OPTION]... ARCHIVELOCATION %%f %%p %%r'\n"
		   "e.g.\n"
		   " restore_command = 'pg_standby /mnt/server/archiverdir %%f %%p %%r'\n");
	printf("\nOptions:\n");
	printf(" -c copies file from archive (default)\n");
	printf(" -d generate lots of debugging output (testing only)\n");
	printf(" -k NUMFILESTOKEEP if RESTARTWALFILE not used, removes files prior to limit\n"
		   " (0 keeps all)\n");
	printf(" -l does nothing; use of link is now deprecated\n");
	printf(" -r MAXRETRIES max number of times to retry, with progressive wait\n"
		   " (default=3)\n");
	printf(" -s SLEEPTIME seconds to wait between file checks (min=1, max=60,\n"
		   " default=5)\n");
	printf(" -t TRIGGERFILE defines a trigger file to initiate failover (no default)\n");
	printf(" -w MAXWAITTIME max seconds to wait for a file (0=no limit) (default=0)\n");
	printf(" --help show this help, then exit\n");
	printf(" --version output version information, then exit\n");
	printf("\nReport bugs to <pgsql-bugs@postgresql.org>.\n");
}
/*
 * sighandler()
 *
 * Signal handler that only records that a signal arrived, by setting the
 * 'signaled' flag. The signal number itself is not used.
 */
static void
sighandler(int sig)
{
	(void) sig;
	signaled = true;
}
#ifndef WIN32
/*
 * sigquit_handler()
 *
 * We don't want SIGQUIT to core dump. Instead, put SIGINT back to its
 * default action and send SIGINT to ourselves.
 */
static void
sigquit_handler(int sig)
{
	pid_t		self = getpid();

	(void) sig;
	signal(SIGINT, SIG_DFL);
	kill(self, SIGINT);
}
#endif
/*------------ MAIN ----------------------------------------*/
/*
 * Parse the command line, then wait until the requested file can be
 * restored from the archive or until failover is triggered.
 *
 * Exit status:
 *	0	the requested file was restored (or --help/--version was handled)
 *	1	the file was not restored: failover, failed restore command, or
 *		missing history file
 *	2	bad command-line arguments
 */
int
main(int argc, char **argv)
{
	int			c;
	size_t		namelen;

	progname = get_progname(argv[0]);

	/* --help and --version are only recognized as the first argument */
	if (argc > 1)
	{
		const char *first = argv[1];

		if (strcmp(first, "--help") == 0 || strcmp(first, "-?") == 0)
		{
			usage();
			exit(0);
		}
		if (strcmp(first, "--version") == 0 || strcmp(first, "-V") == 0)
		{
			puts("pg_standby (PostgreSQL) " PG_VERSION);
			exit(0);
		}
	}

	/*
	 * You can send SIGUSR1 to trigger failover.
	 *
	 * Postmaster uses SIGQUIT to request immediate shutdown. The default
	 * action is to core dump, but we don't want that, so trap it and commit
	 * suicide without core dump.
	 *
	 * We used to use SIGINT and SIGQUIT to trigger failover, but that turned
	 * out to be a bad idea because postmaster uses SIGQUIT to request
	 * immediate shutdown. We still trap SIGINT, but that may change in a
	 * future release.
	 */
	(void) signal(SIGUSR1, sighandler);
	(void) signal(SIGINT, sighandler);	/* deprecated, use SIGUSR1 */
#ifndef WIN32
	(void) signal(SIGQUIT, sigquit_handler);
#endif

	while ((c = getopt(argc, argv, "cdk:lr:s:t:w:")) != -1)
	{
		switch (c)
		{
			case 'c':			/* Use copy */
				restoreCommandType = RESTORE_COMMAND_COPY;
				break;

			case 'd':			/* Debug mode */
				debug = true;
				break;

			case 'k':			/* keepfiles */
				keepfiles = atoi(optarg);
				if (keepfiles < 0)
				{
					fprintf(stderr, "%s: -k keepfiles must be >= 0\n", progname);
					exit(2);
				}
				break;

			case 'l':			/* Use link */

				/*
				 * Link feature disabled, possibly permanently. Linking
				 * causes a problem after recovery ends that is not currently
				 * resolved by PostgreSQL (25 Jun 2009). The option is still
				 * accepted, but it no longer selects RESTORE_COMMAND_LINK.
				 */
				break;

			case 'r':			/* Retries */
				maxretries = atoi(optarg);
				if (maxretries < 0)
				{
					fprintf(stderr, "%s: -r maxretries must be >= 0\n", progname);
					exit(2);
				}
				break;

			case 's':			/* Sleep time */
				sleeptime = atoi(optarg);
				if (sleeptime <= 0 || sleeptime > 60)
				{
					fprintf(stderr, "%s: -s sleeptime incorrectly set\n", progname);
					exit(2);
				}
				break;

			case 't':			/* Trigger file */
				triggerPath = optarg;
				break;

			case 'w':			/* Max wait time */
				maxwaittime = atoi(optarg);
				if (maxwaittime < 0)
				{
					fprintf(stderr, "%s: -w maxwaittime incorrectly set\n", progname);
					exit(2);
				}
				break;

			default:
				fprintf(stderr, "Try \"%s --help\" for more information.\n", progname);
				exit(2);
		}
	}

	/*
	 * Parameter checking - after checking to see if trigger file present
	 */
	if (argc == 1)
	{
		fprintf(stderr, "%s: not enough command-line arguments\n", progname);
		exit(2);
	}

	/*
	 * We will go to the archiveLocation to get nextWALFileName.
	 * nextWALFileName may not exist yet, which would not be an error, so we
	 * separate the archiveLocation and nextWALFileName so we can check
	 * separately whether archiveLocation exists, if not that is an error
	 */
	if (optind >= argc)
	{
		fprintf(stderr, "%s: must specify archive location\n", progname);
		fprintf(stderr, "Try \"%s --help\" for more information.\n", progname);
		exit(2);
	}
	archiveLocation = argv[optind++];

	if (optind >= argc)
	{
		fprintf(stderr, "%s: use %%f to specify nextWALFileName\n", progname);
		fprintf(stderr, "Try \"%s --help\" for more information.\n", progname);
		exit(2);
	}
	nextWALFileName = argv[optind++];

	if (optind >= argc)
	{
		fprintf(stderr, "%s: use %%p to specify xlogFilePath\n", progname);
		fprintf(stderr, "Try \"%s --help\" for more information.\n", progname);
		exit(2);
	}
	xlogFilePath = argv[optind++];

	/* The restart file name is optional */
	if (optind < argc)
		restartWALFileName = argv[optind++];

	CustomizableInitialize();

	need_cleanup = SetWALFileNameForCleanup();

	if (debug)
	{
		fprintf(stderr, "Trigger file : %s\n", triggerPath ? triggerPath : "<not set>");
		fprintf(stderr, "Waiting for WAL file : %s\n", nextWALFileName);
		fprintf(stderr, "WAL file path : %s\n", WALFilePath);
		fprintf(stderr, "Restoring to : %s\n", xlogFilePath);
		fprintf(stderr, "Sleep interval : %d second%s\n",
				sleeptime, (sleeptime > 1 ? "s" : " "));
		fprintf(stderr, "Max wait interval : %d %s\n",
				maxwaittime, (maxwaittime > 0 ? "seconds" : "forever"));
		fprintf(stderr, "Command for restore : %s\n", restoreCommand);
		fprintf(stderr, "Keep archive history : ");
		if (need_cleanup)
			fprintf(stderr, "%s and later\n", exclusiveCleanupFileName);
		else
			fprintf(stderr, "No cleanup required\n");
		fflush(stderr);
	}

	/*
	 * Check for initial history file: always the first file to be requested
	 * It's OK if the file isn't there - all other files need to wait
	 */
	namelen = strlen(nextWALFileName);
	if (namelen > 8 &&
		strspn(nextWALFileName, "0123456789ABCDEF") == 8 &&
		strcmp(nextWALFileName + namelen - strlen(".history"), ".history") == 0)
	{
		nextWALFileType = XLOG_HISTORY;

		if (RestoreWALFileForRecovery())
			exit(0);

		if (debug)
		{
			fprintf(stderr, "history file not found\n");
			fflush(stderr);
		}
		exit(1);
	}

	/*
	 * Main wait loop
	 */
	for (;;)
	{
		/* Check for trigger file or signal first */
		CheckForExternalTrigger();
		if (signaled)
		{
			Failover = FastFailover;
			if (debug)
			{
				fprintf(stderr, "signaled to exit: fast failover\n");
				fflush(stderr);
			}
		}

		/*
		 * Check for fast failover immediately, before checking if the
		 * requested WAL file is available
		 */
		if (Failover == FastFailover)
			exit(1);

		if (CustomizableNextWALFileReady())
		{
			/* Something went wrong in copying the file */
			if (!RestoreWALFileForRecovery())
				exit(1);

			/*
			 * Once we have restored this file successfully we can remove
			 * some prior WAL files. If the restore had failed we mustn't
			 * remove any file because some of them will be requested again
			 * immediately after the failed restore, or when we restart
			 * recovery.
			 */
			if (need_cleanup)
				CustomizableCleanupPriorWALFiles();
			exit(0);
		}

		/* Check for smart failover if the next WAL file was not available */
		if (Failover == SmartFailover)
			exit(1);

		if (sleeptime <= 60)
			pg_usleep(sleeptime * 1000000L);

		waittime += sleeptime;
		if (maxwaittime > 0 && waittime >= maxwaittime)
		{
			/* Acted upon at the top of the next iteration */
			Failover = FastFailover;
			if (debug)
			{
				fprintf(stderr, "Timed out after %d seconds: fast failover\n",
						waittime);
				fflush(stderr);
			}
		}

		if (debug)
		{
			fprintf(stderr, "WAL file not present yet.");
			if (triggerPath)
				fprintf(stderr, " Checking for trigger file...");
			fprintf(stderr, "\n");
			fflush(stderr);
		}
	}
}