mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-15 08:20:16 +08:00
Fix fsync-at-startup code to not treat errors as fatal.
Commit 2ce439f337
introduced a rather serious
regression, namely that if its scan of the data directory came across any
un-fsync-able files, it would fail and thereby prevent database startup.
Worse yet, symlinks to such files also caused the problem, which meant that
crash restart was guaranteed to fail on certain common installations such
as older Debian.
After discussion, we agreed that (1) failure to start is worse than any
consequence of not fsync'ing is likely to be, therefore treat all errors
in this code as nonfatal; (2) we should not chase symlinks other than
those that are expected to exist, namely pg_xlog/ and tablespace links
under pg_tblspc/. The latter restriction avoids possibly fsync'ing a
much larger part of the filesystem than intended, if the user has left
random symlinks hanging about in the data directory.
This commit takes care of that and also does some code beautification,
mainly moving the relevant code into fd.c, which seems a much better place
for it than xlog.c, and making sure that the conditional compilation for
the pre_sync_fname pass has something to do with whether pg_flush_data
works.
I also relocated the call site in xlog.c down a few lines; it seems a
bit silly to be doing this before ValidateXLOGDirectoryStructure().
The similar logic in initdb.c ought to be made to match this, but that
change is noncritical and will be dealt with separately.
Back-patch to all active branches, like the prior commit.
Abhijit Menon-Sen and Tom Lane
This commit is contained in:
parent
b06649b7fe
commit
e4a04e5a4e
@ -628,8 +628,6 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc);
|
||||
static void rm_redo_error_callback(void *arg);
|
||||
static int get_sync_bit(int method);
|
||||
|
||||
static void fsync_pgdata(char *datadir);
|
||||
|
||||
/*
|
||||
* Insert an XLOG record having the specified RMID and info bytes,
|
||||
* with the body of the record being the data chunk(s) described by
|
||||
@ -5925,18 +5923,6 @@ StartupXLOG(void)
|
||||
(errmsg("database system was interrupted; last known up at %s",
|
||||
str_time(ControlFile->time))));
|
||||
|
||||
/*
|
||||
* If we previously crashed, there might be data which we had written,
|
||||
* intending to fsync it, but which we had not actually fsync'd yet.
|
||||
* Therefore, a power failure in the near future might cause earlier
|
||||
* unflushed writes to be lost, even though more recent data written to
|
||||
* disk from here on would be persisted. To avoid that, fsync the entire
|
||||
* data directory.
|
||||
*/
|
||||
if (ControlFile->state != DB_SHUTDOWNED &&
|
||||
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
|
||||
fsync_pgdata(data_directory);
|
||||
|
||||
/* This is just to allow attaching to startup process with a debugger */
|
||||
#ifdef XLOG_REPLAY_DELAY
|
||||
if (ControlFile->state != DB_SHUTDOWNED)
|
||||
@ -5960,6 +5946,18 @@ StartupXLOG(void)
|
||||
*/
|
||||
RelationCacheInitFileRemove();
|
||||
|
||||
/*
|
||||
* If we previously crashed, there might be data which we had written,
|
||||
* intending to fsync it, but which we had not actually fsync'd yet.
|
||||
* Therefore, a power failure in the near future might cause earlier
|
||||
* unflushed writes to be lost, even though more recent data written to
|
||||
* disk from here on would be persisted. To avoid that, fsync the entire
|
||||
* data directory.
|
||||
*/
|
||||
if (ControlFile->state != DB_SHUTDOWNED &&
|
||||
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
|
||||
SyncDataDirectory();
|
||||
|
||||
/*
|
||||
* Initialize on the assumption we want to recover to the same timeline
|
||||
* that's active according to pg_control.
|
||||
@ -9980,31 +9978,3 @@ CheckForStandbyTrigger(void)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Issue fsync recursively on PGDATA and all its contents.
|
||||
*/
|
||||
static void
|
||||
fsync_pgdata(char *datadir)
|
||||
{
|
||||
if (!enableFsync)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If possible, hint to the kernel that we're soon going to fsync
|
||||
* the data directory and its contents.
|
||||
*/
|
||||
#if defined(HAVE_SYNC_FILE_RANGE) || \
|
||||
(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
|
||||
walkdir(datadir, pre_sync_fname);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Now we do the fsync()s in the same order.
|
||||
*
|
||||
* It's important to fsync the destination directory itself as individual
|
||||
* file fsyncs don't guarantee that the directory entry for the file is
|
||||
* synced.
|
||||
*/
|
||||
walkdir(datadir, fsync_fname);
|
||||
}
|
||||
|
@ -59,6 +59,11 @@
|
||||
#include "utils/resowner.h"
|
||||
|
||||
|
||||
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
|
||||
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
|
||||
#define PG_FLUSH_DATA_WORKS 1
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We must leave some file descriptors free for system(), the dynamic loader,
|
||||
* and other code that tries to open files without consulting fd.c. This
|
||||
@ -252,10 +257,21 @@ static void FreeVfd(File file);
|
||||
|
||||
static int FileAccess(File file);
|
||||
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
|
||||
static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
|
||||
|
||||
static void AtProcExit_Files(int code, Datum arg);
|
||||
static void CleanupTempFiles(bool isProcExit);
|
||||
static void RemovePgTempFilesInDir(const char *tmpdirname);
|
||||
|
||||
static void walkdir(const char *path,
|
||||
void (*action) (const char *fname, bool isdir, int elevel),
|
||||
bool process_symlinks,
|
||||
int elevel);
|
||||
#ifdef PG_FLUSH_DATA_WORKS
|
||||
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
|
||||
#endif
|
||||
static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
|
||||
|
||||
|
||||
/*
|
||||
* pg_fsync --- do fsync with or without writethrough
|
||||
@ -331,15 +347,24 @@ pg_fdatasync(int fd)
|
||||
* pg_flush_data --- advise OS that the data described won't be needed soon
|
||||
*
|
||||
* Not all platforms have posix_fadvise; treat as noop if not available.
|
||||
* Also, treat as no-op if enableFsync is off; this is because the call isn't
|
||||
* free, and some platforms such as Linux will actually block the requestor
|
||||
* until the write is scheduled.
|
||||
*/
|
||||
int
|
||||
pg_flush_data(int fd, off_t offset, off_t amount)
|
||||
{
|
||||
#ifdef PG_FLUSH_DATA_WORKS
|
||||
if (enableFsync)
|
||||
{
|
||||
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
|
||||
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
|
||||
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
|
||||
#else
|
||||
return 0;
|
||||
#error PG_FLUSH_DATA_WORKS should not have been defined
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -1599,15 +1624,28 @@ TryAgain:
|
||||
*/
|
||||
struct dirent *
|
||||
ReadDir(DIR *dir, const char *dirname)
|
||||
{
|
||||
return ReadDirExtended(dir, dirname, ERROR);
|
||||
}
|
||||
|
||||
/*
|
||||
* Alternate version that allows caller to specify the elevel for any
|
||||
* error report. If elevel < ERROR, returns NULL on any error.
|
||||
*/
|
||||
static struct dirent *
|
||||
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
|
||||
{
|
||||
struct dirent *dent;
|
||||
|
||||
/* Give a generic message for AllocateDir failure, if caller didn't */
|
||||
if (dir == NULL)
|
||||
ereport(ERROR,
|
||||
{
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open directory \"%s\": %m",
|
||||
dirname)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
errno = 0;
|
||||
if ((dent = readdir(dir)) != NULL)
|
||||
@ -1620,7 +1658,7 @@ ReadDir(DIR *dir, const char *dirname)
|
||||
#endif
|
||||
|
||||
if (errno)
|
||||
ereport(ERROR,
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read directory \"%s\": %m",
|
||||
dirname)));
|
||||
@ -2005,54 +2043,121 @@ fsync_fname(char *fname, bool isdir)
|
||||
close(fd);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Hint to the OS that it should get ready to fsync() this file.
|
||||
* Issue fsync recursively on PGDATA and all its contents.
|
||||
*
|
||||
* Adapted from pre_sync_fname in initdb.c
|
||||
* We fsync regular files and directories wherever they are, but we
|
||||
* follow symlinks only for pg_xlog and immediately under pg_tblspc.
|
||||
* Other symlinks are presumed to point at files we're not responsible
|
||||
* for fsyncing, and might not have privileges to write at all.
|
||||
*
|
||||
* Errors are logged but not considered fatal; that's because this is used
|
||||
* only during database startup, to deal with the possibility that there are
|
||||
* issued-but-unsynced writes pending against the data directory. We want to
|
||||
* ensure that such writes reach disk before anything that's done in the new
|
||||
* run. However, aborting on error would result in failure to start for
|
||||
* harmless cases such as read-only files in the data directory, and that's
|
||||
* not good either.
|
||||
*
|
||||
* Note we assume we're chdir'd into PGDATA to begin with.
|
||||
*/
|
||||
void
|
||||
pre_sync_fname(char *fname, bool isdir)
|
||||
SyncDataDirectory(void)
|
||||
{
|
||||
int fd;
|
||||
bool xlog_is_symlink;
|
||||
|
||||
fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
|
||||
|
||||
/*
|
||||
* Some OSs don't allow us to open directories at all (Windows returns
|
||||
* EACCES)
|
||||
*/
|
||||
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
|
||||
/* We can skip this whole thing if fsync is disabled. */
|
||||
if (!enableFsync)
|
||||
return;
|
||||
|
||||
if (fd < 0)
|
||||
ereport(FATAL,
|
||||
(errmsg("could not open file \"%s\": %m",
|
||||
fname)));
|
||||
/*
|
||||
* If pg_xlog is a symlink, we'll need to recurse into it separately,
|
||||
* because the first walkdir below will ignore it.
|
||||
*/
|
||||
xlog_is_symlink = false;
|
||||
|
||||
pg_flush_data(fd, 0, 0);
|
||||
#ifndef WIN32
|
||||
{
|
||||
struct stat st;
|
||||
|
||||
close(fd);
|
||||
if (lstat("pg_xlog", &st) < 0)
|
||||
ereport(LOG,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not stat file \"%s\": %m",
|
||||
"pg_xlog")));
|
||||
else if (S_ISLNK(st.st_mode))
|
||||
xlog_is_symlink = true;
|
||||
}
|
||||
#else
|
||||
if (pgwin32_is_junction("pg_xlog"))
|
||||
xlog_is_symlink = true;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If possible, hint to the kernel that we're soon going to fsync the data
|
||||
* directory and its contents. Errors in this step are even less
|
||||
* interesting than normal, so log them only at DEBUG1.
|
||||
*/
|
||||
#ifdef PG_FLUSH_DATA_WORKS
|
||||
walkdir(".", pre_sync_fname, false, DEBUG1);
|
||||
if (xlog_is_symlink)
|
||||
walkdir("pg_xlog", pre_sync_fname, false, DEBUG1);
|
||||
walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Now we do the fsync()s in the same order.
|
||||
*
|
||||
* The main call ignores symlinks, so in addition to specially processing
|
||||
* pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
|
||||
* process_symlinks = true. Note that if there are any plain directories
|
||||
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
|
||||
* so we don't worry about optimizing it.
|
||||
*/
|
||||
walkdir(".", fsync_fname_ext, false, LOG);
|
||||
if (xlog_is_symlink)
|
||||
walkdir("pg_xlog", fsync_fname_ext, false, LOG);
|
||||
walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
|
||||
}
|
||||
|
||||
/*
|
||||
* walkdir: recursively walk a directory, applying the action to each
|
||||
* regular file and directory (including the named directory itself)
|
||||
* and following symbolic links.
|
||||
* regular file and directory (including the named directory itself).
|
||||
*
|
||||
* NB: There is another version of walkdir in initdb.c, but that version
|
||||
* behaves differently with respect to symbolic links. Caveat emptor!
|
||||
* If process_symlinks is true, the action and recursion are also applied
|
||||
* to regular files and directories that are pointed to by symlinks in the
|
||||
* given directory; otherwise symlinks are ignored. Symlinks are always
|
||||
* ignored in subdirectories, ie we intentionally don't pass down the
|
||||
* process_symlinks flag to recursive calls.
|
||||
*
|
||||
* Errors are reported at level elevel, which might be ERROR or less.
|
||||
*
|
||||
* See also walkdir in initdb.c, which is a frontend version of this logic.
|
||||
*/
|
||||
void
|
||||
walkdir(char *path, void (*action) (char *fname, bool isdir))
|
||||
static void
|
||||
walkdir(const char *path,
|
||||
void (*action) (const char *fname, bool isdir, int elevel),
|
||||
bool process_symlinks,
|
||||
int elevel)
|
||||
{
|
||||
DIR *dir;
|
||||
struct dirent *de;
|
||||
|
||||
dir = AllocateDir(path);
|
||||
while ((de = ReadDir(dir, path)) != NULL)
|
||||
if (dir == NULL)
|
||||
{
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open directory \"%s\": %m", path)));
|
||||
return;
|
||||
}
|
||||
|
||||
while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
|
||||
{
|
||||
char subpath[MAXPGPATH];
|
||||
struct stat fst;
|
||||
int sret;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
@ -2062,60 +2167,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
|
||||
|
||||
snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
|
||||
|
||||
if (lstat(subpath, &fst) < 0)
|
||||
ereport(ERROR,
|
||||
if (process_symlinks)
|
||||
sret = stat(subpath, &fst);
|
||||
else
|
||||
sret = lstat(subpath, &fst);
|
||||
|
||||
if (sret < 0)
|
||||
{
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not stat file \"%s\": %m", subpath)));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (S_ISREG(fst.st_mode))
|
||||
(*action) (subpath, false);
|
||||
(*action) (subpath, false, elevel);
|
||||
else if (S_ISDIR(fst.st_mode))
|
||||
walkdir(subpath, action);
|
||||
#ifndef WIN32
|
||||
else if (S_ISLNK(fst.st_mode))
|
||||
#else
|
||||
else if (pgwin32_is_junction(subpath))
|
||||
#endif
|
||||
{
|
||||
#if defined(HAVE_READLINK) || defined(WIN32)
|
||||
char linkpath[MAXPGPATH];
|
||||
int len;
|
||||
struct stat lst;
|
||||
|
||||
len = readlink(subpath, linkpath, sizeof(linkpath)-1);
|
||||
if (len < 0)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read symbolic link \"%s\": %m",
|
||||
subpath)));
|
||||
|
||||
if (len >= sizeof(linkpath)-1)
|
||||
ereport(ERROR,
|
||||
(errmsg("symbolic link \"%s\" target is too long",
|
||||
subpath)));
|
||||
|
||||
linkpath[len] = '\0';
|
||||
|
||||
if (lstat(linkpath, &lst) == 0)
|
||||
{
|
||||
if (S_ISREG(lst.st_mode))
|
||||
(*action) (linkpath, false);
|
||||
else if (S_ISDIR(lst.st_mode))
|
||||
walkdir(subpath, action);
|
||||
}
|
||||
else if (errno != ENOENT)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not stat file \"%s\": %m", linkpath)));
|
||||
#else
|
||||
ereport(WARNING,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("this platform does not support symbolic links; ignoring \"%s\"",
|
||||
subpath)));
|
||||
#endif
|
||||
}
|
||||
walkdir(subpath, action, false, elevel);
|
||||
}
|
||||
FreeDir(dir);
|
||||
|
||||
(*action) (path, true);
|
||||
FreeDir(dir); /* we ignore any error here */
|
||||
|
||||
/*
|
||||
* It's important to fsync the destination directory itself as individual
|
||||
* file fsyncs don't guarantee that the directory entry for the file is
|
||||
* synced.
|
||||
*/
|
||||
(*action) (path, true, elevel);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Hint to the OS that it should get ready to fsync() this file.
|
||||
*
|
||||
* Ignores errors trying to open unreadable files, and logs other errors at a
|
||||
* caller-specified level.
|
||||
*/
|
||||
#ifdef PG_FLUSH_DATA_WORKS
|
||||
|
||||
static void
|
||||
pre_sync_fname(const char *fname, bool isdir, int elevel)
|
||||
{
|
||||
int fd;
|
||||
|
||||
fd = BasicOpenFile((char *) fname, O_RDONLY | PG_BINARY, 0);
|
||||
|
||||
if (fd < 0)
|
||||
{
|
||||
if (errno == EACCES || (isdir && errno == EISDIR))
|
||||
return;
|
||||
|
||||
#ifdef ETXTBSY
|
||||
if (errno == ETXTBSY)
|
||||
return;
|
||||
#endif
|
||||
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open file \"%s\": %m", fname)));
|
||||
return;
|
||||
}
|
||||
|
||||
(void) pg_flush_data(fd, 0, 0);
|
||||
|
||||
(void) close(fd);
|
||||
}
|
||||
|
||||
#endif /* PG_FLUSH_DATA_WORKS */
|
||||
|
||||
/*
|
||||
* fsync_fname_ext -- Try to fsync a file or directory
|
||||
*
|
||||
* Ignores errors trying to open unreadable files, or trying to fsync
|
||||
* directories on systems where that isn't allowed/required, and logs other
|
||||
* errors at a caller-specified level.
|
||||
*/
|
||||
static void
|
||||
fsync_fname_ext(const char *fname, bool isdir, int elevel)
|
||||
{
|
||||
int fd;
|
||||
int flags;
|
||||
int returncode;
|
||||
|
||||
/*
|
||||
* Some OSs require directories to be opened read-only whereas other
|
||||
* systems don't allow us to fsync files opened read-only; so we need both
|
||||
* cases here. Using O_RDWR will cause us to fail to fsync files that are
|
||||
* not writable by our userid, but we assume that's OK.
|
||||
*/
|
||||
flags = PG_BINARY;
|
||||
if (!isdir)
|
||||
flags |= O_RDWR;
|
||||
else
|
||||
flags |= O_RDONLY;
|
||||
|
||||
/*
|
||||
* Open the file, silently ignoring errors about unreadable files (or
|
||||
* unsupported operations, e.g. opening a directory under Windows), and
|
||||
* logging others.
|
||||
*/
|
||||
fd = BasicOpenFile((char *) fname, flags, 0);
|
||||
if (fd < 0)
|
||||
{
|
||||
if (errno == EACCES || (isdir && errno == EISDIR))
|
||||
return;
|
||||
|
||||
#ifdef ETXTBSY
|
||||
if (errno == ETXTBSY)
|
||||
return;
|
||||
#endif
|
||||
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open file \"%s\": %m", fname)));
|
||||
return;
|
||||
}
|
||||
|
||||
returncode = pg_fsync(fd);
|
||||
|
||||
/*
|
||||
* Some OSes don't allow us to fsync directories at all, so we can ignore
|
||||
* those errors. Anything else needs to be logged.
|
||||
*/
|
||||
if (returncode != 0 && !(isdir && errno == EBADF))
|
||||
ereport(elevel,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not fsync file \"%s\": %m", fname)));
|
||||
|
||||
(void) close(fd);
|
||||
}
|
||||
|
@ -100,8 +100,7 @@ extern int pg_fsync_writethrough(int fd);
|
||||
extern int pg_fdatasync(int fd);
|
||||
extern int pg_flush_data(int fd, off_t offset, off_t amount);
|
||||
extern void fsync_fname(char *fname, bool isdir);
|
||||
extern void pre_sync_fname(char *fname, bool isdir);
|
||||
extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
|
||||
extern void SyncDataDirectory(void);
|
||||
|
||||
/* Filename components for OpenTemporaryFile */
|
||||
#define PG_TEMP_FILES_DIR "pgsql_tmp"
|
||||
|
Loading…
Reference in New Issue
Block a user