diff --git a/libraries/liblmdb/Makefile b/libraries/liblmdb/Makefile
index bca5cd3819..2d0983eff0 100644
--- a/libraries/liblmdb/Makefile
+++ b/libraries/liblmdb/Makefile
@@ -11,6 +11,7 @@
# - MDB_USE_POSIX_SEM
# - MDB_DSYNC
# - MDB_FDATASYNC
+# - MDB_FDATASYNC_WORKS
# - MDB_USE_PWRITEV
#
# There may be other macros in mdb.c of interest. You should
diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h
index 408b6ed0ea..547c852fb9 100644
--- a/libraries/liblmdb/lmdb.h
+++ b/libraries/liblmdb/lmdb.h
@@ -516,8 +516,8 @@ int mdb_env_create(MDB_env **env);
* and uses fewer mallocs, but loses protection from application bugs
* like wild pointer writes and other bad updates into the database.
* Incompatible with nested transactions.
- * Processes with and without MDB_WRITEMAP on the same environment do
- * not cooperate well.
+ * Do not mix processes with and without MDB_WRITEMAP on the same
+ * environment. This can defeat durability (#mdb_env_sync etc.).
*
#MDB_NOMETASYNC
* Flush system buffers to disk only once per transaction, omit the
* metadata flush. Defer that until the system flushes files to disk,
@@ -698,7 +698,8 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* Data is always written to disk when #mdb_txn_commit() is called,
* but the operating system may keep it buffered. LMDB always flushes
* the OS buffers upon commit as well, unless the environment was
- * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
+ * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is
+ * not valid if the environment was opened with #MDB_RDONLY.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] force If non-zero, force a synchronous flush. Otherwise
* if the environment has the #MDB_NOSYNC flag set the flushes
@@ -706,6 +707,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:
*
+ * - EACCES - the environment is read-only.
*
- EINVAL - an invalid parameter was specified.
*
- EIO - an error occurred during synchronization.
*
diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c
index 6cc1422c7e..5f287f4b18 100644
--- a/libraries/liblmdb/mdb.c
+++ b/libraries/liblmdb/mdb.c
@@ -79,6 +79,14 @@ extern int cacheflush(char *addr, int nbytes, int cache);
#define CACHEFLUSH(addr, bytes, cache)
#endif
+#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
+/** fdatasync is broken on ext3/ext4 under older Linux kernels; see
+ * the comments in #mdb_env_open2 for details. You can safely
+ * define MDB_FDATASYNC_WORKS if this code will only be run
+ * on kernels 3.6 and newer.
+ */
+#define BROKEN_FDATASYNC
+#endif
#include
#include
@@ -333,7 +341,6 @@ mdb_sem_wait(sem_t *sem)
*/
#ifndef MDB_FDATASYNC
# define MDB_FDATASYNC fdatasync
-# define HAVE_FDATASYNC 1
#endif
#ifndef MDB_MSYNC
@@ -1097,6 +1104,8 @@ struct MDB_env {
#define MDB_ENV_ACTIVE 0x20000000U
/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
+ /** fdatasync is unreliable; #mdb_env_sync uses fsync instead */
+#define MDB_FSYNCONLY 0x08000000U
uint32_t me_flags; /**< @ref mdb_env */
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
@@ -1113,7 +1122,7 @@ struct MDB_env {
MDB_txn *me_txn; /**< current write transaction */
MDB_txn *me_txn0; /**< prealloc'd write transaction */
size_t me_mapsize; /**< size of the data memory map */
- size_t me_size; /**< current file size */
+ off_t me_size; /**< current file size */
pgno_t me_maxpg; /**< me_mapsize / me_psize */
MDB_dbx *me_dbxs; /**< array of static DB info */
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
@@ -2299,19 +2308,12 @@ fail:
return rc;
}
-/* internal env_sync flags: */
-#define FORCE 1 /* as before, force a flush */
-#define FGREW 0x8000 /* file has grown, do a full fsync instead of just
- fdatasync. We shouldn't have to do this, according to the POSIX spec.
- But common Linux FSs violate the spec and won't sync required metadata
- correctly when the file grows. This only makes a difference if the
- platform actually distinguishes fdatasync from fsync.
- http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */
-
-static int
-mdb_env_sync0(MDB_env *env, int flag)
+int
+mdb_env_sync(MDB_env *env, int force)
{
- int rc = 0, force = flag & FORCE;
+ int rc = 0;
+ if (env->me_flags & MDB_RDONLY)
+ return EACCES;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
if (env->me_flags & MDB_WRITEMAP) {
int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
@@ -2323,9 +2325,9 @@ mdb_env_sync0(MDB_env *env, int flag)
rc = ErrCode();
#endif
} else {
-#ifdef HAVE_FDATASYNC
- if (flag & FGREW) {
- if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */
+#ifdef BROKEN_FDATASYNC
+ if (env->me_flags & MDB_FSYNCONLY) {
+ if (fsync(env->me_fd))
rc = ErrCode();
} else
#endif
@@ -2336,12 +2338,6 @@ mdb_env_sync0(MDB_env *env, int flag)
return rc;
}
-int
-mdb_env_sync(MDB_env *env, int force)
-{
- return mdb_env_sync0(env, force != 0);
-}
-
/** Back up parent txn's cursors, then grab the originals for tracking */
static int
mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
@@ -3394,15 +3390,8 @@ mdb_txn_commit(MDB_txn *txn)
mdb_audit(txn);
#endif
- i = 0;
-#ifdef HAVE_FDATASYNC
- if (txn->mt_next_pgno * env->me_psize > env->me_size) {
- i |= FGREW;
- env->me_size = txn->mt_next_pgno * env->me_psize;
- }
-#endif
if ((rc = mdb_page_flush(txn, 0)) ||
- (rc = mdb_env_sync0(env, i)) ||
+ (rc = mdb_env_sync(env, 0)) ||
(rc = mdb_env_write_meta(txn)))
goto fail;
@@ -3879,6 +3868,11 @@ mdb_fsize(HANDLE fd, size_t *size)
return MDB_SUCCESS;
}
+#ifdef BROKEN_FDATASYNC
+#include
+#include
+#endif
+
/** Further setup required for opening an LMDB environment
*/
static int ESECT
@@ -3896,6 +3890,53 @@ mdb_env_open2(MDB_env *env)
else
env->me_pidquery = PROCESS_QUERY_INFORMATION;
#endif /* _WIN32 */
+#ifdef BROKEN_FDATASYNC
+ /* ext3/ext4 fdatasync is broken on some older Linux kernels.
+ * https://lkml.org/lkml/2012/9/3/83
+ * Kernels after 3.6-rc6 are known good.
+ * https://lkml.org/lkml/2012/9/10/556
+ * See if the DB is on ext3/ext4, then check for a new enough kernel.
+ * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
+ * to be patched.
+ */
+ {
+ struct statfs st;
+ fstatfs(env->me_fd, &st);
+ while (st.f_type == 0xEF53) {
+ struct utsname uts;
+ int i;
+ uname(&uts);
+ if (uts.release[0] < '3') {
+ if (!strncmp(uts.release, "2.6.32.", 7)) {
+ i = atoi(uts.release+7);
+ if (i >= 60)
+ break; /* 2.6.32.60 and newer is OK */
+ } else if (!strncmp(uts.release, "2.6.34.", 7)) {
+ i = atoi(uts.release+7);
+ if (i >= 15)
+ break; /* 2.6.34.15 and newer is OK */
+ }
+ } else if (uts.release[0] == '3') {
+ i = atoi(uts.release+2);
+ if (i > 5)
+ break; /* 3.6 and newer is OK */
+ if (i == 5) {
+ i = atoi(uts.release+4);
+ if (i >= 4)
+ break; /* 3.5.4 and newer is OK */
+ } else if (i == 2) {
+ i = atoi(uts.release+4);
+ if (i >= 30)
+ break; /* 3.2.30 and newer is OK */
+ }
+ } else { /* 4.x and newer is OK */
+ break;
+ }
+ env->me_flags |= MDB_FSYNCONLY;
+ break;
+ }
+ }
+#endif
memset(&meta, 0, sizeof(meta));
@@ -3926,10 +3967,6 @@ mdb_env_open2(MDB_env *env)
env->me_mapsize = minsize;
}
- rc = mdb_fsize(env->me_fd, &env->me_size);
- if (rc)
- return rc;
-
rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
if (rc)
return rc;