Merge remote-tracking branch 'origin/mdb.RE/0.9'

This commit is contained in:
Quanah Gibson-Mount 2015-01-13 10:24:08 -06:00
commit 9328a41f49
3 changed files with 78 additions and 38 deletions

View File

@ -11,6 +11,7 @@
# - MDB_USE_POSIX_SEM
# - MDB_DSYNC
# - MDB_FDATASYNC
# - MDB_FDATASYNC_WORKS
# - MDB_USE_PWRITEV
#
# There may be other macros in mdb.c of interest. You should

View File

@ -516,8 +516,8 @@ int mdb_env_create(MDB_env **env);
* and uses fewer mallocs, but loses protection from application bugs
* like wild pointer writes and other bad updates into the database.
* Incompatible with nested transactions.
* Processes with and without MDB_WRITEMAP on the same environment do
* not cooperate well.
* Do not mix processes with and without MDB_WRITEMAP on the same
* environment. This can defeat durability (#mdb_env_sync etc).
* <li>#MDB_NOMETASYNC
* Flush system buffers to disk only once per transaction, omit the
* metadata flush. Defer that until the system flushes files to disk,
@ -698,7 +698,8 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* Data is always written to disk when #mdb_txn_commit() is called,
* but the operating system may keep it buffered. LMDB always flushes
* the OS buffers upon commit as well, unless the environment was
* opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
* opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is
* not valid if the environment was opened with #MDB_RDONLY.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] force If non-zero, force a synchronous flush. Otherwise
* if the environment has the #MDB_NOSYNC flag set the flushes
@ -706,6 +707,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:
* <ul>
* <li>EACCES - the environment is read-only.
* <li>EINVAL - an invalid parameter was specified.
* <li>EIO - an error occurred during synchronization.
* </ul>

View File

@ -79,6 +79,14 @@ extern int cacheflush(char *addr, int nbytes, int cache);
#define CACHEFLUSH(addr, bytes, cache)
#endif
#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
/** fdatasync is broken on ext3/ext4fs on older kernels, see
* description in #mdb_env_open2 comments. You can safely
* define MDB_FDATASYNC_WORKS if this code will only be run
* on kernels 3.6 and newer.
*/
#define BROKEN_FDATASYNC
#endif
#include <errno.h>
#include <limits.h>
@ -333,7 +341,6 @@ mdb_sem_wait(sem_t *sem)
*/
#ifndef MDB_FDATASYNC
# define MDB_FDATASYNC fdatasync
# define HAVE_FDATASYNC 1
#endif
#ifndef MDB_MSYNC
@ -1097,6 +1104,8 @@ struct MDB_env {
#define MDB_ENV_ACTIVE 0x20000000U
/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
/** fdatasync is unreliable */
#define MDB_FSYNCONLY 0x08000000U
uint32_t me_flags; /**< @ref mdb_env */
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
@ -1113,7 +1122,7 @@ struct MDB_env {
MDB_txn *me_txn; /**< current write transaction */
MDB_txn *me_txn0; /**< prealloc'd write transaction */
size_t me_mapsize; /**< size of the data memory map */
size_t me_size; /**< current file size */
off_t me_size; /**< current file size */
pgno_t me_maxpg; /**< me_mapsize / me_psize */
MDB_dbx *me_dbxs; /**< array of static DB info */
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
@ -2299,19 +2308,12 @@ fail:
return rc;
}
/* internal env_sync flags: */
#define FORCE 1 /* as before, force a flush */
#define FGREW 0x8000 /* file has grown, do a full fsync instead of just
fdatasync. We shouldn't have to do this, according to the POSIX spec.
But common Linux FSs violate the spec and won't sync required metadata
correctly when the file grows. This only makes a difference if the
platform actually distinguishes fdatasync from fsync.
http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */
static int
mdb_env_sync0(MDB_env *env, int flag)
int
mdb_env_sync(MDB_env *env, int force)
{
int rc = 0, force = flag & FORCE;
int rc = 0;
if (env->me_flags & MDB_RDONLY)
return EACCES;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
if (env->me_flags & MDB_WRITEMAP) {
int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
@ -2323,9 +2325,9 @@ mdb_env_sync0(MDB_env *env, int flag)
rc = ErrCode();
#endif
} else {
#ifdef HAVE_FDATASYNC
if (flag & FGREW) {
if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */
#ifdef BROKEN_FDATASYNC
if (env->me_flags & MDB_FSYNCONLY) {
if (fsync(env->me_fd))
rc = ErrCode();
} else
#endif
@ -2336,12 +2338,6 @@ mdb_env_sync0(MDB_env *env, int flag)
return rc;
}
int
mdb_env_sync(MDB_env *env, int force)
{
return mdb_env_sync0(env, force != 0);
}
/** Back up parent txn's cursors, then grab the originals for tracking */
static int
mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
@ -3394,15 +3390,8 @@ mdb_txn_commit(MDB_txn *txn)
mdb_audit(txn);
#endif
i = 0;
#ifdef HAVE_FDATASYNC
if (txn->mt_next_pgno * env->me_psize > env->me_size) {
i |= FGREW;
env->me_size = txn->mt_next_pgno * env->me_psize;
}
#endif
if ((rc = mdb_page_flush(txn, 0)) ||
(rc = mdb_env_sync0(env, i)) ||
(rc = mdb_env_sync(env, 0)) ||
(rc = mdb_env_write_meta(txn)))
goto fail;
@ -3879,6 +3868,11 @@ mdb_fsize(HANDLE fd, size_t *size)
return MDB_SUCCESS;
}
#ifdef BROKEN_FDATASYNC
#include <sys/utsname.h>
#include <sys/vfs.h>
#endif
/** Further setup required for opening an LMDB environment
*/
static int ESECT
@ -3896,6 +3890,53 @@ mdb_env_open2(MDB_env *env)
else
env->me_pidquery = PROCESS_QUERY_INFORMATION;
#endif /* _WIN32 */
#ifdef BROKEN_FDATASYNC
/* ext3/ext4 fdatasync is broken on some older Linux kernels.
* https://lkml.org/lkml/2012/9/3/83
* Kernels after 3.6-rc6 are known good.
* https://lkml.org/lkml/2012/9/10/556
* See if the DB is on ext3/ext4, then check for new enough kernel
* Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
* to be patched.
*/
{
struct statfs st;
fstatfs(env->me_fd, &st);
while (st.f_type == 0xEF53) {
struct utsname uts;
int i;
uname(&uts);
if (uts.release[0] < '3') {
if (!strncmp(uts.release, "2.6.32.", 7)) {
i = atoi(uts.release+7);
if (i >= 60)
break; /* 2.6.32.60 and newer is OK */
} else if (!strncmp(uts.release, "2.6.34.", 7)) {
i = atoi(uts.release+7);
if (i >= 15)
break; /* 2.6.34.15 and newer is OK */
}
} else if (uts.release[0] == '3') {
i = atoi(uts.release+2);
if (i > 5)
break; /* 3.6 and newer is OK */
if (i == 5) {
i = atoi(uts.release+4);
if (i >= 4)
break; /* 3.5.4 and newer is OK */
} else if (i == 2) {
i = atoi(uts.release+4);
if (i >= 30)
break; /* 3.2.30 and newer is OK */
}
} else { /* 4.x and newer is OK */
break;
}
env->me_flags |= MDB_FSYNCONLY;
break;
}
}
#endif
memset(&meta, 0, sizeof(meta));
@ -3926,10 +3967,6 @@ mdb_env_open2(MDB_env *env)
env->me_mapsize = minsize;
}
rc = mdb_fsize(env->me_fd, &env->me_size);
if (rc)
return rc;
rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
if (rc)
return rc;