Further improvements in pg_ctl's new wait-for-postmaster-start logic.

Add a postmaster_is_alive() test to the wait loop, so that we stop waiting
if the postmaster dies without removing its pidfile.  Unfortunately this
only helps after the postmaster has created its pidfile, since until then
we don't know which PID to check.  But if it never does create the pidfile,
we can give up in a relatively short time, so this is a useful addition
in practice.  Per suggestion from Fujii Masao, though this doesn't look
very much like his patch.

In addition, improve pg_ctl's ability to cope with pre-existing pidfiles.
Such a file might or might not represent a live postmaster that is going to
block our postmaster from starting, but the previous code pre-judged the
situation and gave up waiting immediately.  Now, we will wait for up to 5
seconds to see if our postmaster overwrites such a file.  This issue
interacts with Fujii's patch because we would make the wrong conclusion
if we did the postmaster_is_alive() test with a pre-existing PID.

All of this could be improved if we rewrote start_postmaster() so that it
could report the child postmaster's PID, so that we'd know a-priori the
correct PID to test with postmaster_is_alive().  That looks like a bit too
much change for so late in the 9.1 development cycle, unfortunately.
This commit is contained in:
Tom Lane 2011-06-01 13:09:07 -04:00
parent 6923d699bc
commit 3c485ca8e6

View File

@ -366,6 +366,10 @@ start_postmaster(void)
/*
* Since there might be quotes to handle here, it is easier simply to pass
* everything to a shell to process them.
*
* XXX it would be better to fork and exec so that we would know the
* child postmaster's PID directly; then test_postmaster_connection could
* use the PID without having to rely on reading it back from the pidfile.
*/
if (log_file != NULL)
snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
@ -412,6 +416,8 @@ static PGPing
test_postmaster_connection(bool do_checkpoint)
{
PGPing ret = PQPING_NO_RESPONSE;
bool found_stale_pidfile = false;
pgpid_t pm_pid = 0;
char connstr[MAXPGPATH * 2 + 256];
int i;
@ -458,7 +464,7 @@ test_postmaster_connection(bool do_checkpoint)
if (optlines[3] == NULL)
{
/* File is exactly three lines, must be pre-9.1 */
write_stderr(_("%s: -w option is not supported when starting a pre-9.1 server\n"),
write_stderr(_("\n%s: -w option is not supported when starting a pre-9.1 server\n"),
progname);
return PQPING_NO_ATTEMPT;
}
@ -466,40 +472,55 @@ test_postmaster_connection(bool do_checkpoint)
optlines[5] != NULL)
{
/* File is complete enough for us, parse it */
long pmpid;
time_t pmstart;
/*
* Make sanity checks. If it's for a standalone backend
* (negative PID), or the recorded start time is before
* pg_ctl started, then either we are looking at the wrong
* data directory, or this is a pre-existing pidfile that
* hasn't (yet?) been overwritten by our child postmaster.
* Allow 2 seconds slop for possible cross-process clock
* skew.
*/
pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
if (pmpid <= 0 || pmstart < start_time - 2)
{
/*
* Set flag to report stale pidfile if it doesn't
* get overwritten before we give up waiting.
*/
found_stale_pidfile = true;
}
else
{
/*
* OK, seems to be a valid pidfile from our child.
*/
int portnum;
char *sockdir;
char *hostaddr;
char host_str[MAXPGPATH];
/*
* Easy cross-check that we are looking at the right data
* directory: is the postmaster older than this execution
* of pg_ctl? Subtract 2 seconds to allow for possible
* clock skew between pg_ctl and the postmaster.
*/
pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
if (pmstart < start_time - 2)
{
write_stderr(_("%s: this data directory is running a pre-existing postmaster\n"),
progname);
return PQPING_NO_ATTEMPT;
}
found_stale_pidfile = false;
pm_pid = (pgpid_t) pmpid;
/*
* OK, extract port number and host string to use. Prefer
* Extract port number and host string to use. Prefer
* using Unix socket if available.
*/
portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
/*
* While unix_socket_directory can accept relative
* directories, libpq's host parameter must have a leading
* slash to indicate a socket directory. So, ignore
* sockdir if it's relative, and try to use TCP instead.
* directories, libpq's host parameter must have a
* leading slash to indicate a socket directory. So,
* ignore sockdir if it's relative, and try to use TCP
* instead.
*/
if (sockdir[0] == '/')
strlcpy(host_str, sockdir, sizeof(host_str));
@ -510,22 +531,22 @@ test_postmaster_connection(bool do_checkpoint)
if (strchr(host_str, '\n') != NULL)
*strchr(host_str, '\n') = '\0';
/* Fail if we couldn't get either sockdir or host addr */
/* Fail if couldn't get either sockdir or host addr */
if (host_str[0] == '\0')
{
write_stderr(_("%s: -w option cannot use a relative socket directory specification\n"),
write_stderr(_("\n%s: -w option cannot use a relative socket directory specification\n"),
progname);
return PQPING_NO_ATTEMPT;
}
/* If postmaster is listening on "*", use "localhost" */
/* If postmaster is listening on "*", use localhost */
if (strcmp(host_str, "*") == 0)
strcpy(host_str, "localhost");
/*
* We need to set connect_timeout otherwise on Windows the
* Service Control Manager (SCM) will probably timeout
* first.
* We need to set connect_timeout otherwise on Windows
* the Service Control Manager (SCM) will probably
* timeout first.
*/
snprintf(connstr, sizeof(connstr),
"dbname=postgres port=%d host='%s' connect_timeout=5",
@ -533,6 +554,7 @@ test_postmaster_connection(bool do_checkpoint)
}
}
}
}
/* If we have a connection string, ping the server */
if (connstr[0] != '\0')
@ -545,7 +567,11 @@ test_postmaster_connection(bool do_checkpoint)
/*
* The postmaster should create postmaster.pid very soon after being
* started. If it's not there after we've waited 5 or more seconds,
* assume startup failed and give up waiting.
* assume startup failed and give up waiting. (Note this covers
* both cases where the pidfile was never created, and where it was
* created and then removed during postmaster exit.) Also, if there
* *is* a file there but it appears stale, issue a suitable warning
* and give up waiting.
*/
if (i >= 5)
{
@ -553,7 +579,23 @@ test_postmaster_connection(bool do_checkpoint)
if (stat(pid_file, &statbuf) != 0)
return PQPING_NO_RESPONSE;
if (found_stale_pidfile)
{
write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
progname);
return PQPING_NO_RESPONSE;
}
}
/*
* If we've been able to identify the child postmaster's PID, check
* the process is still alive. This covers cases where the postmaster
* successfully created the pidfile but then crashed without removing
* it.
*/
if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
return PQPING_NO_RESPONSE;
/* No response, or startup still in process; wait */
#if defined(WIN32)
@ -715,7 +757,6 @@ do_init(void)
static void
do_start(void)
{
pgpid_t pid;
pgpid_t old_pid = 0;
int exitcode;
@ -765,19 +806,6 @@ do_start(void)
exit(1);
}
if (old_pid != 0)
{
pg_usleep(1000000);
pid = get_pgpid();
if (pid == old_pid)
{
write_stderr(_("%s: could not start server\n"
"Examine the log output.\n"),
progname);
exit(1);
}
}
if (do_wait)
{
print_msg(_("waiting for server to start..."));