From 888a76f6512ed83af99cc4c5871abbbf907ccb02 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 8 Jan 2015 12:56:42 +0000 Subject: [PATCH 1/6] Revert "Fix prev commit for env_sync0" This reverts commit e95c9231fc8de567b724477373259e07c62ce210. Revert "Hack for potential ext3/ext4 corruption issue" This reverts commit 91155b9d676f8abe3fe5e8a96b22b4dd51f963dd. --- libraries/liblmdb/mdb.c | 43 +++++------------------------------------ 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 6cc1422c7e..0867af7f52 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -333,7 +333,6 @@ mdb_sem_wait(sem_t *sem) */ #ifndef MDB_FDATASYNC # define MDB_FDATASYNC fdatasync -# define HAVE_FDATASYNC 1 #endif #ifndef MDB_MSYNC @@ -1113,7 +1112,7 @@ struct MDB_env { MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ size_t me_mapsize; /**< size of the data memory map */ - size_t me_size; /**< current file size */ + off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ @@ -2299,19 +2298,10 @@ fail: return rc; } -/* internal env_sync flags: */ -#define FORCE 1 /* as before, force a flush */ -#define FGREW 0x8000 /* file has grown, do a full fsync instead of just - fdatasync. We shouldn't have to do this, according to the POSIX spec. - But common Linux FSs violate the spec and won't sync required metadata - correctly when the file grows. This only makes a difference if the - platform actually distinguishes fdatasync from fsync. 
- http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */ - -static int -mdb_env_sync0(MDB_env *env, int flag) +int +mdb_env_sync(MDB_env *env, int force) { - int rc = 0, force = flag & FORCE; + int rc = 0; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (env->me_flags & MDB_WRITEMAP) { int flags = ((env->me_flags & MDB_MAPASYNC) && !force) @@ -2323,12 +2313,6 @@ mdb_env_sync0(MDB_env *env, int flag) rc = ErrCode(); #endif } else { -#ifdef HAVE_FDATASYNC - if (flag & FGREW) { - if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */ - rc = ErrCode(); - } else -#endif if (MDB_FDATASYNC(env->me_fd)) rc = ErrCode(); } @@ -2336,12 +2320,6 @@ mdb_env_sync0(MDB_env *env, int flag) return rc; } -int -mdb_env_sync(MDB_env *env, int force) -{ - return mdb_env_sync0(env, force != 0); -} - /** Back up parent txn's cursors, then grab the originals for tracking */ static int mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) @@ -3394,15 +3372,8 @@ mdb_txn_commit(MDB_txn *txn) mdb_audit(txn); #endif - i = 0; -#ifdef HAVE_FDATASYNC - if (txn->mt_next_pgno * env->me_psize > env->me_size) { - i |= FGREW; - env->me_size = txn->mt_next_pgno * env->me_psize; - } -#endif if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync0(env, i)) || + (rc = mdb_env_sync(env, 0)) || (rc = mdb_env_write_meta(txn))) goto fail; @@ -3926,10 +3897,6 @@ mdb_env_open2(MDB_env *env) env->me_mapsize = minsize; } - rc = mdb_fsize(env->me_fd, &env->me_size); - if (rc) - return rc; - rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); if (rc) return rc; From 4500d49f362502c8f3117e335190d72e33bcd126 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 8 Jan 2015 13:03:55 +0000 Subject: [PATCH 2/6] fdatasync hack, again Check for ext3/ext4 fs, then check kernel version. 
--- libraries/liblmdb/mdb.c | 61 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0867af7f52..264c15fc9f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1142,6 +1142,9 @@ struct MDB_env { #elif defined(MDB_USE_POSIX_SEM) sem_t *me_rmutex; /* Shared mutexes are not supported */ sem_t *me_wmutex; +#endif +#ifdef __linux + int me_fsynconly; /**< fdatasync is unreliable */ #endif void *me_userctx; /**< User-settable context */ MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ @@ -2313,6 +2316,12 @@ mdb_env_sync(MDB_env *env, int force) rc = ErrCode(); #endif } else { +#ifdef __linux + if (env->me_fsynconly) { + if (fsync(env->me_fd)) + rc = ErrCode(); + } else +#endif if (MDB_FDATASYNC(env->me_fd)) rc = ErrCode(); } @@ -3850,6 +3859,11 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } +#ifdef __linux +#include <sys/utsname.h> +#include <sys/vfs.h> +#endif + /** Further setup required for opening an LMDB environment */ static int ESECT @@ -3867,6 +3881,53 @@ mdb_env_open2(MDB_env *env) else env->me_pidquery = PROCESS_QUERY_INFORMATION; #endif /* _WIN32 */ +#ifdef __linux + /* ext3/ext4 fdatasync is broken on some older Linux kernels. + * https://lkml.org/lkml/2012/9/3/83 + * Kernels after 3.6-rc6 are known good. + * https://lkml.org/lkml/2012/9/10/556 + * See if the DB is on ext3/ext4, then check for new enough kernel + * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known + * to be patched. 
+ */ + { + struct statfs st; + fstatfs(env->me_fd, &st); + while (st.f_type == 0xEF53) { + struct utsname uts; + int i; + uname(&uts); + if (uts.release[0] < '3') { + if (!strncmp(uts.release, "2.6.32.", 7)) { + i = atoi(uts.release+7); + if (i >= 60) + break; /* 2.6.32.60 and newer is OK */ + } else if (!strncmp(uts.release, "2.6.34.", 7)) { + i = atoi(uts.release+7); + if (i >= 15) + break; /* 2.6.34.15 and newer is OK */ + } + } else if (uts.release[0] == '3') { + i = atoi(uts.release+2); + if (i > 5) + break; /* 3.6 and newer is OK */ + if (i == 5) { + i = atoi(uts.release+4); + if (i >= 4) + break; /* 3.5.4 and newer is OK */ + } else if (i == 2) { + i = atoi(uts.release+4); + if (i >= 30) + break; /* 3.2.30 and newer is OK */ + } + } else { /* 4.x and newer is OK */ + break; + } + env->me_fsynconly = 1; + break; + } + } +#endif memset(&meta, 0, sizeof(meta)); From 0b9f42d1f4c2b36ab1e44ca7e62fddc0abc300a2 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 9 Jan 2015 11:25:07 +0000 Subject: [PATCH 3/6] ITS#8021 env_sync is invalid in RDONLY env --- libraries/liblmdb/lmdb.h | 4 +++- libraries/liblmdb/mdb.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 408b6ed0ea..35f5f420e3 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -698,7 +698,8 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * Data is always written to disk when #mdb_txn_commit() is called, * but the operating system may keep it buffered. LMDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] force If non-zero, force a synchronous flush. 
Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes @@ -706,6 +707,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * @return A non-zero error value on failure and 0 on success. Some possible * errors are: *
<ul> + *
<li>EACCES - the environment is read-only. *
<li>EINVAL - an invalid parameter was specified. *
<li>EIO - an error occurred during synchronization. *
</ul> diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 264c15fc9f..a575768508 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2305,6 +2305,8 @@ int mdb_env_sync(MDB_env *env, int force) { int rc = 0; + if (env->me_flags & MDB_RDONLY) + return EACCES; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (env->me_flags & MDB_WRITEMAP) { int flags = ((env->me_flags & MDB_MAPASYNC) && !force) From e85c944a3a385dd7459985f4eec2edc06591c36d Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 11 Jan 2015 11:41:08 +0000 Subject: [PATCH 4/6] Tweak conditionals for fdatasync hack --- libraries/liblmdb/Makefile | 1 + libraries/liblmdb/mdb.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/Makefile b/libraries/liblmdb/Makefile index bca5cd3819..2d0983eff0 100644 --- a/libraries/liblmdb/Makefile +++ b/libraries/liblmdb/Makefile @@ -11,6 +11,7 @@ # - MDB_USE_POSIX_SEM # - MDB_DSYNC # - MDB_FDATASYNC +# - MDB_FDATASYNC_WORKS # - MDB_USE_PWRITEV # # There may be other macros in mdb.c of interest. 
You should diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index a575768508..d93afdc450 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -79,6 +79,12 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define CACHEFLUSH(addr, bytes, cache) #endif +#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) +/** fdatasync is broken on ext3/ext4fs on older kernels, see + * description in #mdb_env_open2 comments + */ +#define BROKEN_FDATASYNC +#endif #include <errno.h> #include <limits.h> @@ -1143,7 +1149,7 @@ struct MDB_env { sem_t *me_rmutex; /* Shared mutexes are not supported */ sem_t *me_wmutex; #endif -#ifdef __linux +#ifdef BROKEN_FDATASYNC int me_fsynconly; /**< fdatasync is unreliable */ #endif void *me_userctx; /**< User-settable context */ @@ -2318,7 +2324,7 @@ mdb_env_sync(MDB_env *env, int force) rc = ErrCode(); #endif } else { -#ifdef __linux +#ifdef BROKEN_FDATASYNC if (env->me_fsynconly) { if (fsync(env->me_fd)) rc = ErrCode(); @@ -3861,7 +3867,7 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } -#ifdef __linux +#ifdef BROKEN_FDATASYNC #include <sys/utsname.h> #include <sys/vfs.h> #endif @@ -3883,7 +3889,7 @@ mdb_env_open2(MDB_env *env) else env->me_pidquery = PROCESS_QUERY_INFORMATION; #endif /* _WIN32 */ -#ifdef __linux +#ifdef BROKEN_FDATASYNC /* ext3/ext4 fdatasync is broken on some older Linux kernels. * https://lkml.org/lkml/2012/9/3/83 * Kernels after 3.6-rc6 are known good. 
From bf3961e3c7ad703ae550bff3706533f734518a30 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Mon, 12 Jan 2015 10:36:38 +0000 Subject: [PATCH 5/6] More cleanup for fdatasync hack --- libraries/liblmdb/mdb.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d93afdc450..5f287f4b18 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -81,7 +81,9 @@ extern int cacheflush(char *addr, int nbytes, int cache); #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) /** fdatasync is broken on ext3/ext4fs on older kernels, see - * description in #mdb_env_open2 comments + * description in #mdb_env_open2 comments. You can safely + * define MDB_FDATASYNC_WORKS if this code will only be run + * on kernels 3.6 and newer. */ #define BROKEN_FDATASYNC #endif @@ -1102,6 +1104,8 @@ struct MDB_env { #define MDB_ENV_ACTIVE 0x20000000U /** me_txkey is set */ #define MDB_ENV_TXKEY 0x10000000U + /** fdatasync is unreliable */ +#define MDB_FSYNCONLY 0x08000000U uint32_t me_flags; /**< @ref mdb_env */ unsigned int me_psize; /**< DB page size, inited from me_os_psize */ unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ @@ -1148,9 +1152,6 @@ struct MDB_env { #elif defined(MDB_USE_POSIX_SEM) sem_t *me_rmutex; /* Shared mutexes are not supported */ sem_t *me_wmutex; -#endif -#ifdef BROKEN_FDATASYNC - int me_fsynconly; /**< fdatasync is unreliable */ #endif void *me_userctx; /**< User-settable context */ MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ @@ -2325,7 +2326,7 @@ mdb_env_sync(MDB_env *env, int force) #endif } else { #ifdef BROKEN_FDATASYNC - if (env->me_fsynconly) { + if (env->me_flags & MDB_FSYNCONLY) { if (fsync(env->me_fd)) rc = ErrCode(); } else @@ -3931,7 +3932,7 @@ mdb_env_open2(MDB_env *env) } else { /* 4.x and newer is OK */ break; } - env->me_fsynconly = 1; + env->me_flags |= MDB_FSYNCONLY; break; } } From 
0599dee9d496627e67b1c3f97b446ed5de785607 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 12 Jan 2015 12:19:51 +0100 Subject: [PATCH 6/6] ITS#8021 doc: Don't mix MDB_WRITEMAP + non-WRITEMAP --- libraries/liblmdb/lmdb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 35f5f420e3..547c852fb9 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -516,8 +516,8 @@ int mdb_env_create(MDB_env **env); * and uses fewer mallocs, but loses protection from application bugs * like wild pointer writes and other bad updates into the database. * Incompatible with nested transactions. - * Processes with and without MDB_WRITEMAP on the same environment do - * not cooperate well. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdb_env_sync etc). *
<li>#MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to disk,