diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 7af3b913ff..c4b8f15f65 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -366,6 +366,10 @@ start_postmaster(void) /* * Since there might be quotes to handle here, it is easier simply to pass * everything to a shell to process them. + * + * XXX it would be better to fork and exec so that we would know the + * child postmaster's PID directly; then test_postmaster_connection could + * use the PID without having to rely on reading it back from the pidfile. */ if (log_file != NULL) snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE, @@ -412,6 +416,8 @@ static PGPing test_postmaster_connection(bool do_checkpoint) { PGPing ret = PQPING_NO_RESPONSE; + bool found_stale_pidfile = false; + pgpid_t pm_pid = 0; char connstr[MAXPGPATH * 2 + 256]; int i; @@ -458,7 +464,7 @@ test_postmaster_connection(bool do_checkpoint) if (optlines[3] == NULL) { /* File is exactly three lines, must be pre-9.1 */ - write_stderr(_("%s: -w option is not supported when starting a pre-9.1 server\n"), + write_stderr(_("\n%s: -w option is not supported when starting a pre-9.1 server\n"), progname); return PQPING_NO_ATTEMPT; } @@ -466,70 +472,86 @@ test_postmaster_connection(bool do_checkpoint) optlines[5] != NULL) { /* File is complete enough for us, parse it */ + long pmpid; time_t pmstart; - int portnum; - char *sockdir; - char *hostaddr; - char host_str[MAXPGPATH]; /* - * Easy cross-check that we are looking at the right data - * directory: is the postmaster older than this execution - * of pg_ctl? Subtract 2 seconds to allow for possible - * clock skew between pg_ctl and the postmaster. + * Make sanity checks. If it's for a standalone backend + * (negative PID), or the recorded start time is before + * pg_ctl started, then either we are looking at the wrong + * data directory, or this is a pre-existing pidfile that + * hasn't (yet?) been overwritten by our child postmaster. + * Allow 2 seconds slop for possible cross-process clock + * skew. */ + pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]); pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]); - if (pmstart < start_time - 2) + if (pmpid <= 0 || pmstart < start_time - 2) { - write_stderr(_("%s: this data directory is running a pre-existing postmaster\n"), - progname); - return PQPING_NO_ATTEMPT; + /* + * Set flag to report stale pidfile if it doesn't + * get overwritten before we give up waiting. + */ + found_stale_pidfile = true; } - - /* - * OK, extract port number and host string to use. Prefer - * using Unix socket if available. - */ - portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]); - - sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1]; - hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1]; - - /* - * While unix_socket_directory can accept relative - * directories, libpq's host parameter must have a leading - * slash to indicate a socket directory. So, ignore - * sockdir if it's relative, and try to use TCP instead. - */ - if (sockdir[0] == '/') - strlcpy(host_str, sockdir, sizeof(host_str)); else - strlcpy(host_str, hostaddr, sizeof(host_str)); - - /* remove trailing newline */ - if (strchr(host_str, '\n') != NULL) - *strchr(host_str, '\n') = '\0'; - - /* Fail if we couldn't get either sockdir or host addr */ - if (host_str[0] == '\0') { - write_stderr(_("%s: -w option cannot use a relative socket directory specification\n"), - progname); - return PQPING_NO_ATTEMPT; + /* + * OK, seems to be a valid pidfile from our child. + */ + int portnum; + char *sockdir; + char *hostaddr; + char host_str[MAXPGPATH]; + + found_stale_pidfile = false; + pm_pid = (pgpid_t) pmpid; + + /* + * Extract port number and host string to use. Prefer + * using Unix socket if available. + */ + portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]); + sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1]; + hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1]; + + /* + * While unix_socket_directory can accept relative + * directories, libpq's host parameter must have a + * leading slash to indicate a socket directory. So, + * ignore sockdir if it's relative, and try to use TCP + * instead. + */ + if (sockdir[0] == '/') + strlcpy(host_str, sockdir, sizeof(host_str)); + else + strlcpy(host_str, hostaddr, sizeof(host_str)); + + /* remove trailing newline */ + if (strchr(host_str, '\n') != NULL) + *strchr(host_str, '\n') = '\0'; + + /* Fail if couldn't get either sockdir or host addr */ + if (host_str[0] == '\0') + { + write_stderr(_("\n%s: -w option cannot use a relative socket directory specification\n"), + progname); + return PQPING_NO_ATTEMPT; + } + + /* If postmaster is listening on "*", use localhost */ + if (strcmp(host_str, "*") == 0) + strcpy(host_str, "localhost"); + + /* + * We need to set connect_timeout otherwise on Windows + * the Service Control Manager (SCM) will probably + * timeout first. + */ + snprintf(connstr, sizeof(connstr), + "dbname=postgres port=%d host='%s' connect_timeout=5", + portnum, host_str); } - - /* If postmaster is listening on "*", use "localhost" */ - if (strcmp(host_str, "*") == 0) - strcpy(host_str, "localhost"); - - /* - * We need to set connect_timeout otherwise on Windows the - * Service Control Manager (SCM) will probably timeout - * first. - */ - snprintf(connstr, sizeof(connstr), - "dbname=postgres port=%d host='%s' connect_timeout=5", - portnum, host_str); } } } @@ -545,7 +567,11 @@ test_postmaster_connection(bool do_checkpoint) /* * The postmaster should create postmaster.pid very soon after being * started. If it's not there after we've waited 5 or more seconds, - * assume startup failed and give up waiting. + * assume startup failed and give up waiting. (Note this covers + * both cases where the pidfile was never created, and where it was + * created and then removed during postmaster exit.) Also, if there + * *is* a file there but it appears stale, issue a suitable warning + * and give up waiting. */ if (i >= 5) { @@ -553,8 +579,24 @@ test_postmaster_connection(bool do_checkpoint) if (stat(pid_file, &statbuf) != 0) return PQPING_NO_RESPONSE; + + if (found_stale_pidfile) + { + write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"), + progname); + return PQPING_NO_RESPONSE; + } } + /* + * If we've been able to identify the child postmaster's PID, check + * the process is still alive. This covers cases where the postmaster + * successfully created the pidfile but then crashed without removing + * it. + */ + if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid)) + return PQPING_NO_RESPONSE; + /* No response, or startup still in process; wait */ #if defined(WIN32) if (do_checkpoint) @@ -715,7 +757,6 @@ do_init(void) static void do_start(void) { - pgpid_t pid; pgpid_t old_pid = 0; int exitcode; @@ -765,19 +806,6 @@ do_start(void) exit(1); } - if (old_pid != 0) - { - pg_usleep(1000000); - pid = get_pgpid(); - if (pid == old_pid) - { - write_stderr(_("%s: could not start server\n" - "Examine the log output.\n"), - progname); - exit(1); - } - } - if (do_wait) { print_msg(_("waiting for server to start..."));