From f8c183a1ac02aef14832c1f29946ef2bcb5866b7 Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Mon, 15 Feb 2010 00:50:57 +0000 Subject: [PATCH] Speed up CREATE DATABASE by deferring the fsyncs until after copying all the data and using posix_fadvise to nudge the OS into flushing it earlier. This also hopefully makes CREATE DATABASE avoid spamming the cache. Tests show a big speedup on Linux at least on some filesystems. Idea and patch from Andres Freund. --- src/backend/storage/file/fd.c | 18 +++++++- src/include/storage/fd.h | 3 +- src/port/copydir.c | 84 ++++++++++++++++++++++++----------- 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index ec27859e60..adea849ab0 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.153 2010/01/12 02:42:52 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.154 2010/02/15 00:50:57 stark Exp $ * * NOTES: * @@ -319,6 +319,22 @@ pg_fdatasync(int fd) return 0; } +/* + * pg_flush_data --- advise OS that the data described won't be needed soon + * + * Not all platforms have posix_fadvise; treat as noop if not available. 
+ */ +int +pg_flush_data(int fd, off_t offset, off_t amount) +{ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); +#else + return 0; +#endif +} + + /* * InitFileAccess --- initialize this module during backend startup * diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 20f60918af..9dd240e34c 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.66 2010/01/02 16:58:08 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.67 2010/02/15 00:50:57 stark Exp $ * *------------------------------------------------------------------------- */ @@ -98,6 +98,7 @@ extern int pg_fsync(int fd); extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); +extern int pg_flush_data(int fd, off_t offset, off_t amount); /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp" diff --git a/src/port/copydir.c b/src/port/copydir.c index 0bf764ecff..a52b1f71a1 100644 --- a/src/port/copydir.c +++ b/src/port/copydir.c @@ -11,7 +11,7 @@ * as a service. * * IDENTIFICATION - * $PostgreSQL: pgsql/src/port/copydir.c,v 1.25 2010/02/14 17:50:52 stark Exp $ + * $PostgreSQL: pgsql/src/port/copydir.c,v 1.26 2010/02/15 00:50:57 stark Exp $ * *------------------------------------------------------------------------- */ @@ -37,6 +37,7 @@ static void copy_file(char *fromfile, char *tofile); +static void fsync_fname(char *fname); /* @@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse) copy_file(fromfile, tofile); } - FreeDir(xldir); - /* - * fsync the directory to make sure not just the data but also the - * new directory file entries have reached the disk. 
While needed - * by most filesystems, the window got bigger with newer ones like - * ext4. + * Be paranoid here and fsync all files to ensure we catch problems. */ - dirfd = BasicOpenFile(todir, - O_RDONLY | PG_BINARY, - S_IRUSR | S_IWUSR); - if(dirfd == -1) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open directory for fsync \"%s\": %m", todir))); - - if(pg_fsync(dirfd) == -1) + if (xldir == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not fsync directory \"%s\": %m", todir))); - close(dirfd); + errmsg("could not open directory \"%s\": %m", fromdir))); + + while ((xlde = ReadDir(xldir, fromdir)) != NULL) + { + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name); + fsync_fname(tofile); + } + FreeDir(xldir); + + /* It's important to fsync the destination directory itself as + * individual file fsyncs don't guarantee that the directory entry + * for the file is synced. Recent versions of ext4 have made the + * window much wider but it's been true for ext3 and other + * filesystems in the past. + */ + fsync_fname(todir); } /* @@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile) int srcfd; int dstfd; int nbytes; + off_t offset; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) @@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile) /* * Do the data copying. */ - for (;;) + for (offset=0; ; offset+=nbytes) { nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) @@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile) (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } - } - /* - * Be paranoid here to ensure we catch problems. 
- */ - if (pg_fsync(dstfd) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", tofile))); + /* + * We fsync the files later but first flush them to avoid spamming + * the cache and hopefully get the kernel to start writing them + * out before the fsync comes. + */ + pg_flush_data(dstfd, offset, nbytes); + } if (close(dstfd)) ereport(ERROR, @@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile) pfree(buffer); } + + + +/* + * fsync a file + */ +static void +fsync_fname(char *fname) +{ + int fd = BasicOpenFile(fname, + O_RDONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + close(fd); +}