Ugly-as-sin initial Windows port

This commit is contained in:
Howard Chu 2011-09-01 03:38:48 -07:00
parent 2510bea718
commit 30736a0ff5

View File

@ -35,12 +35,16 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/param.h> #include <sys/param.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/uio.h> #include <sys/uio.h>
#include <sys/mman.h> #include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H #ifdef HAVE_SYS_FILE_H
#include <sys/file.h> #include <sys/file.h>
#endif #endif
#include <fcntl.h> #include <fcntl.h>
#endif
#include <assert.h> #include <assert.h>
#include <errno.h> #include <errno.h>
@ -51,8 +55,42 @@
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#ifdef _WIN32
#define pthread_t DWORD
#define pthread_mutex_t HANDLE
#define pthread_key_t DWORD
#define pthread_self() GetCurrentThreadId()
#define pthread_key_create(x,y) *(x) = TlsAlloc()
#define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) TlsSetValue(x,y)
#define pthread_mutex_unlock(x) ReleaseMutex(x)
#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
#define LOCK_MUTEX_R(env) pthread_mutex_lock(env->me_rmutex)
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(env->me_rmutex)
#define LOCK_MUTEX_W(env) pthread_mutex_lock(env->me_wmutex)
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(env->me_wmutex)
#define getpid() GetCurrentProcessId()
#else
#include <pthread.h> #include <pthread.h>
#define LOCK_MUTEX_R(env) pthread_mutex_lock(&env->me_txns->mti_mutex)
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&env->me_txns->mti_mutex)
#define LOCK_MUTEX_W(env) pthread_mutex_lock(&env->me_txns->mti_wmutex)
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&env->me_txns->mti_wmutex)
#endif
#ifdef _WIN32
#define fdatasync(fd) !FlushFileBuffers(fd)
#define ErrCode() GetLastError()
#define GetPageSize(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define close(fd) CloseHandle(fd)
#define munmap(ptr,len) UnmapViewOfFile(ptr)
#else
#define ErrCode() errno
#define HANDLE int
#define INVALID_HANDLE_VALUE -1
#define GetPageSize(x) (x) = sysconf(_SC_PAGE_SIZE)
#endif
#include "mdb.h" #include "mdb.h"
#define ULONG unsigned long #define ULONG unsigned long
@ -156,7 +194,16 @@ typedef struct MDB_reader {
typedef struct MDB_txbody { typedef struct MDB_txbody {
uint32_t mtb_magic; uint32_t mtb_magic;
uint32_t mtb_version; uint32_t mtb_version;
/* For POSIX the actual mutexes reside in shared memory.
* On Windows, mutexes are allocated by the kernel; we store
* the name in shared memory so that other processes can
* grab them.
*/
#ifdef _WIN32
char mtb_rmname[32];
#else
pthread_mutex_t mtb_mutex; pthread_mutex_t mtb_mutex;
#endif
ULONG mtb_txnid; ULONG mtb_txnid;
uint32_t mtb_numreaders; uint32_t mtb_numreaders;
uint32_t mtb_me_toggle; uint32_t mtb_me_toggle;
@ -168,14 +215,20 @@ typedef struct MDB_txninfo {
#define mti_magic mt1.mtb.mtb_magic #define mti_magic mt1.mtb.mtb_magic
#define mti_version mt1.mtb.mtb_version #define mti_version mt1.mtb.mtb_version
#define mti_mutex mt1.mtb.mtb_mutex #define mti_mutex mt1.mtb.mtb_mutex
#define mti_rmname mt1.mtb.mtb_rmname
#define mti_txnid mt1.mtb.mtb_txnid #define mti_txnid mt1.mtb.mtb_txnid
#define mti_numreaders mt1.mtb.mtb_numreaders #define mti_numreaders mt1.mtb.mtb_numreaders
#define mti_me_toggle mt1.mtb.mtb_me_toggle #define mti_me_toggle mt1.mtb.mtb_me_toggle
char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
} mt1; } mt1;
union { union {
#ifdef _WIN32
char mt2_wmname[32];
#define mti_wmname mt2.mt2_wmname
#else
pthread_mutex_t mt2_wmutex; pthread_mutex_t mt2_wmutex;
#define mti_wmutex mt2.mt2_wmutex #define mti_wmutex mt2.mt2_wmutex
#endif
char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)]; char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
} mt2; } mt2;
MDB_reader mti_readers[1]; MDB_reader mti_readers[1];
@ -351,9 +404,9 @@ typedef struct MDB_xcursor {
} MDB_xcursor; } MDB_xcursor;
struct MDB_env { struct MDB_env {
int me_fd; HANDLE me_fd;
int me_lfd; HANDLE me_lfd;
int me_mfd; /* just for writing the meta pages */ HANDLE me_mfd; /* just for writing the meta pages */
#define MDB_FATAL_ERROR 0x80000000U #define MDB_FATAL_ERROR 0x80000000U
uint32_t me_flags; uint32_t me_flags;
uint32_t me_extrapad; /* unused for now */ uint32_t me_extrapad; /* unused for now */
@ -378,6 +431,10 @@ struct MDB_env {
pgno_t me_free_pgs[MDB_IDL_UM_SIZE]; pgno_t me_free_pgs[MDB_IDL_UM_SIZE];
MIDL2 me_dirty_list[MDB_IDL_DB_SIZE]; MIDL2 me_dirty_list[MDB_IDL_DB_SIZE];
LAZY_RWLOCK_DEF(me_dblock); LAZY_RWLOCK_DEF(me_dblock);
#ifdef _WIN32
HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
HANDLE me_wmutex;
#endif
}; };
#define NODESIZE offsetof(MDB_node, mn_data) #define NODESIZE offsetof(MDB_node, mn_data)
@ -464,6 +521,12 @@ static int memncmp(const void *s1, size_t n1,
static int memnrcmp(const void *s1, size_t n1, static int memnrcmp(const void *s1, size_t n1,
const void *s2, size_t n2); const void *s2, size_t n2);
#ifdef _WIN32
static SECURITY_DESCRIPTOR mdb_null_sd;
static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;
#endif
static int static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2) memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{ {
@ -506,7 +569,7 @@ mdb_version(int *maj, int *min, int *pat)
return MDB_VERSION_STRING; return MDB_VERSION_STRING;
} }
static char *const errstr[] = { static char *const mdb_errstr[] = {
"MDB_KEYEXIST: Key/data pair already exists", "MDB_KEYEXIST: Key/data pair already exists",
"MDB_NOTFOUND: No matching key/data pair found", "MDB_NOTFOUND: No matching key/data pair found",
"MDB_PAGE_NOTFOUND: Requested page not found", "MDB_PAGE_NOTFOUND: Requested page not found",
@ -522,7 +585,7 @@ mdb_strerror(int err)
return ("Successful return: 0"); return ("Successful return: 0");
if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH) if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
return errstr[err - MDB_KEYEXIST]; return mdb_errstr[err - MDB_KEYEXIST];
return strerror(err); return strerror(err);
} }
@ -731,7 +794,7 @@ mdb_env_sync(MDB_env *env, int force)
int rc = 0; int rc = 0;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
if (fdatasync(env->me_fd)) if (fdatasync(env->me_fd))
rc = errno; rc = ErrCode();
} }
return rc; return rc;
} }
@ -751,19 +814,19 @@ mdb_txn_renew0(MDB_txn *txn)
pid_t pid = getpid(); pid_t pid = getpid();
pthread_t tid = pthread_self(); pthread_t tid = pthread_self();
pthread_mutex_lock(&env->me_txns->mti_mutex); LOCK_MUTEX_R(env);
for (i=0; i<env->me_txns->mti_numreaders; i++) for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0) if (env->me_txns->mti_readers[i].mr_pid == 0)
break; break;
if (i == env->me_maxreaders) { if (i == env->me_maxreaders) {
pthread_mutex_unlock(&env->me_txns->mti_mutex); UNLOCK_MUTEX_R(env);
return ENOMEM; return ENOMEM;
} }
env->me_txns->mti_readers[i].mr_pid = pid; env->me_txns->mti_readers[i].mr_pid = pid;
env->me_txns->mti_readers[i].mr_tid = tid; env->me_txns->mti_readers[i].mr_tid = tid;
if (i >= env->me_txns->mti_numreaders) if (i >= env->me_txns->mti_numreaders)
env->me_txns->mti_numreaders = i+1; env->me_txns->mti_numreaders = i+1;
pthread_mutex_unlock(&env->me_txns->mti_mutex); UNLOCK_MUTEX_R(env);
r = &env->me_txns->mti_readers[i]; r = &env->me_txns->mti_readers[i];
pthread_setspecific(env->me_txkey, r); pthread_setspecific(env->me_txkey, r);
} }
@ -772,7 +835,7 @@ mdb_txn_renew0(MDB_txn *txn)
r->mr_txnid = txn->mt_txnid; r->mr_txnid = txn->mt_txnid;
txn->mt_u.reader = r; txn->mt_u.reader = r;
} else { } else {
pthread_mutex_lock(&env->me_txns->mti_wmutex); LOCK_MUTEX_W(env);
txn->mt_txnid = env->me_txns->mti_txnid+1; txn->mt_txnid = env->me_txns->mti_txnid+1;
txn->mt_toggle = env->me_txns->mti_me_toggle; txn->mt_toggle = env->me_txns->mti_me_toggle;
@ -830,7 +893,7 @@ mdb_txn_begin(MDB_env *env, unsigned int flags, MDB_txn **ret)
return MDB_PANIC; return MDB_PANIC;
} }
if ((txn = calloc(1, sizeof(MDB_txn) + env->me_maxdbs * sizeof(MDB_db))) == NULL) { if ((txn = calloc(1, sizeof(MDB_txn) + env->me_maxdbs * sizeof(MDB_db))) == NULL) {
DPRINTF("calloc: %s", strerror(errno)); DPRINTF("calloc: %s", strerror(ErrCode()));
return ENOMEM; return ENOMEM;
} }
txn->mt_dbs = (MDB_db *)(txn+1); txn->mt_dbs = (MDB_db *)(txn+1);
@ -884,7 +947,7 @@ mdb_txn_reset0(MDB_txn *txn)
env->me_txn = NULL; env->me_txn = NULL;
for (i=2; i<env->me_numdbs; i++) for (i=2; i<env->me_numdbs; i++)
env->me_dbxs[i].md_dirty = 0; env->me_dbxs[i].md_dirty = 0;
pthread_mutex_unlock(&env->me_txns->mti_wmutex); UNLOCK_MUTEX_W(env);
} }
} }
@ -925,7 +988,6 @@ mdb_txn_commit(MDB_txn *txn)
MDB_dpage *dp; MDB_dpage *dp;
MDB_env *env; MDB_env *env;
pgno_t next; pgno_t next;
struct iovec iov[MDB_COMMIT_PAGES];
assert(txn != NULL); assert(txn != NULL);
assert(txn->mt_env != NULL); assert(txn->mt_env != NULL);
@ -1041,6 +1103,36 @@ mdb_txn_commit(MDB_txn *txn)
next = 0; next = 0;
i = 1; i = 1;
do { do {
#ifdef _WIN32
/* Windows actually supports scatter/gather I/O, but only on
* unbuffered file handles. Since we're relying on the OS page
* cache for all our data, that's self-defeating. So we just
* write pages one at a time. We use the ov structure to set
* the write offset, to at least save the overhead of a Seek
* system call.
*/
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
dp = txn->mt_u.dirty_list[i].mptr;
DPRINTF("committing page %lu", dp->p.mp_pgno);
size = dp->p.mp_pgno * env->me_psize;
ov.Offset = size & 0xffffffff;
ov.OffsetHigh = size >> 16;
ov.OffsetHigh >>= 16;
/* clear dirty flag */
dp->p.mp_flags &= ~P_DIRTY;
rc = WriteFile(env->me_fd, &dp->p, env->me_psize * dp->h.md_num, NULL, &ov);
if (!rc) {
n = ErrCode();
DPRINTF("WriteFile: %d", n);
mdb_txn_abort(txn);
return n;
}
}
done = 1;;
#else
struct iovec iov[MDB_COMMIT_PAGES];
n = 0; n = 0;
done = 1; done = 1;
size = 0; size = 0;
@ -1051,11 +1143,11 @@ mdb_txn_commit(MDB_txn *txn)
DPRINTF("committing %u dirty pages", n); DPRINTF("committing %u dirty pages", n);
rc = writev(env->me_fd, iov, n); rc = writev(env->me_fd, iov, n);
if (rc != size) { if (rc != size) {
n = errno; n = ErrCode();
if (rc > 0) if (rc > 0)
DPUTS("short write, filesystem full?"); DPUTS("short write, filesystem full?");
else else
DPRINTF("writev: %s", strerror(errno)); DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn); mdb_txn_abort(txn);
return n; return n;
} }
@ -1085,15 +1177,15 @@ mdb_txn_commit(MDB_txn *txn)
DPRINTF("committing %u dirty pages", n); DPRINTF("committing %u dirty pages", n);
rc = writev(env->me_fd, iov, n); rc = writev(env->me_fd, iov, n);
if (rc != size) { if (rc != size) {
n = errno; n = ErrCode();
if (rc > 0) if (rc > 0)
DPUTS("short write, filesystem full?"); DPUTS("short write, filesystem full?");
else else
DPRINTF("writev: %s", strerror(errno)); DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn); mdb_txn_abort(txn);
return n; return n;
} }
#endif
} while (!done); } while (!done);
/* Drop the dirty pages. /* Drop the dirty pages.
@ -1141,7 +1233,7 @@ done:
LAZY_RWLOCK_UNLOCK(&env->me_dblock); LAZY_RWLOCK_UNLOCK(&env->me_dblock);
} }
pthread_mutex_unlock(&env->me_txns->mti_wmutex); UNLOCK_MUTEX_W(env);
free(txn); free(txn);
return MDB_SUCCESS; return MDB_SUCCESS;
@ -1153,20 +1245,25 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
char page[PAGESIZE]; char page[PAGESIZE];
MDB_page *p; MDB_page *p;
MDB_meta *m; MDB_meta *m;
int rc; int rc, err;
assert(env != NULL);
/* We don't know the page size yet, so use a minimum value. /* We don't know the page size yet, so use a minimum value.
*/ */
if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) { #ifdef _WIN32
if (!ReadFile(env->me_fd, page, PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
#else
if ((rc = read(env->me_fd, page, PAGESIZE)) == 0)
#endif
{
return ENOENT; return ENOENT;
} else if (rc != PAGESIZE) { }
else if (rc != PAGESIZE) {
err = ErrCode();
if (rc > 0) if (rc > 0)
errno = EINVAL; err = EINVAL;
DPRINTF("read: %s", strerror(errno)); DPRINTF("read: %s", strerror(err));
return errno; return err;
} }
p = (MDB_page *)page; p = (MDB_page *)page;
@ -1201,7 +1298,8 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
unsigned int psize; unsigned int psize;
DPUTS("writing new meta page"); DPUTS("writing new meta page");
psize = sysconf(_SC_PAGE_SIZE);
GetPageSize(psize);
meta->mm_magic = MDB_MAGIC; meta->mm_magic = MDB_MAGIC;
meta->mm_version = MDB_VERSION; meta->mm_version = MDB_VERSION;
@ -1227,9 +1325,18 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
m = METADATA(q); m = METADATA(q);
memcpy(m, meta, sizeof(*meta)); memcpy(m, meta, sizeof(*meta));
#ifdef _WIN32
{
DWORD len;
rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
}
#else
rc = write(env->me_fd, p, psize * 2); rc = write(env->me_fd, p, psize * 2);
rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
#endif
free(p); free(p);
return (rc == (int)psize * 2) ? MDB_SUCCESS : errno; return rc;
} }
static int static int
@ -1240,6 +1347,9 @@ mdb_env_write_meta(MDB_txn *txn)
off_t off; off_t off;
int rc, len, toggle; int rc, len, toggle;
char *ptr; char *ptr;
#ifdef _WIN32
OVERLAPPED ov;
#endif
assert(txn != NULL); assert(txn != NULL);
assert(txn->mt_env != NULL); assert(txn->mt_env != NULL);
@ -1268,10 +1378,18 @@ mdb_env_write_meta(MDB_txn *txn)
off += PAGEHDRSZ; off += PAGEHDRSZ;
/* Write to the SYNC fd */ /* Write to the SYNC fd */
#ifdef _WIN32
{
memset(&ov, 0, sizeof(ov));
ov.Offset = off;
WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov);
}
#else
rc = pwrite(env->me_mfd, ptr, len, off); rc = pwrite(env->me_mfd, ptr, len, off);
#endif
if (rc != len) { if (rc != len) {
int r2; int r2;
rc = errno; rc = ErrCode();
DPUTS("write failed, disk error?"); DPUTS("write failed, disk error?");
/* On a failure, the pagecache still contains the new data. /* On a failure, the pagecache still contains the new data.
* Write some old data back, to prevent it from being used. * Write some old data back, to prevent it from being used.
@ -1279,7 +1397,11 @@ mdb_env_write_meta(MDB_txn *txn)
*/ */
meta.mm_last_pg = metab.mm_last_pg; meta.mm_last_pg = metab.mm_last_pg;
meta.mm_txnid = metab.mm_txnid; meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
r2 = pwrite(env->me_fd, ptr, len, off); r2 = pwrite(env->me_fd, ptr, len, off);
#endif
env->me_flags |= MDB_FATAL_ERROR; env->me_flags |= MDB_FATAL_ERROR;
return rc; return rc;
} }
@ -1323,9 +1445,9 @@ mdb_env_create(MDB_env **env)
e->me_maxreaders = DEFAULT_READERS; e->me_maxreaders = DEFAULT_READERS;
e->me_maxdbs = 2; e->me_maxdbs = 2;
e->me_fd = -1; e->me_fd = INVALID_HANDLE_VALUE;
e->me_lfd = -1; e->me_lfd = INVALID_HANDLE_VALUE;
e->me_mfd = -1; e->me_mfd = INVALID_HANDLE_VALUE;
*env = e; *env = e;
return MDB_SUCCESS; return MDB_SUCCESS;
} }
@ -1388,13 +1510,41 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
} }
#ifdef _WIN32
{
HANDLE mh;
LONG sizelo, sizehi;
sizelo = env->me_mapsize & 0xffffffff;
sizehi = env->me_mapsize >> 16; /* pointless on WIN32, only needed on W64 */
sizehi >>= 16;
/* Windows won't create mappings for zero length files.
* Just allocate the maxsize right now.
*/
if (newenv) {
SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
if (!SetEndOfFile(env->me_fd))
return ErrCode();
SetFilePointer(env->me_fd, 0, NULL, 0);
}
mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY,
sizehi, sizelo, NULL);
if (!mh)
return ErrCode();
env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize,
meta.mm_address);
CloseHandle(mh);
if (!env->me_map)
return ErrCode();
}
#else
i = MAP_SHARED; i = MAP_SHARED;
if (meta.mm_address && (flags & MDB_FIXEDMAP)) if (meta.mm_address && (flags & MDB_FIXEDMAP))
i |= MAP_FIXED; i |= MAP_FIXED;
env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i, env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
env->me_fd, 0); env->me_fd, 0);
if (env->me_map == MAP_FAILED) if (env->me_map == MAP_FAILED)
return errno; return ErrCode();
#endif
if (newenv) { if (newenv) {
meta.mm_mapsize = env->me_mapsize; meta.mm_mapsize = env->me_mapsize;
@ -1429,6 +1579,8 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
return MDB_SUCCESS; return MDB_SUCCESS;
} }
#ifndef _WIN32
/* Windows doesn't support destructor callbacks for thread-specific storage */
static void static void
mdb_env_reader_dest(void *ptr) mdb_env_reader_dest(void *ptr)
{ {
@ -1438,12 +1590,12 @@ mdb_env_reader_dest(void *ptr)
reader->mr_pid = 0; reader->mr_pid = 0;
reader->mr_tid = 0; reader->mr_tid = 0;
} }
#endif
/* downgrade the exclusive lock on the region back to shared */ /* downgrade the exclusive lock on the region back to shared */
static void static void
mdb_env_share_locks(MDB_env *env) mdb_env_share_locks(MDB_env *env)
{ {
struct flock lock_info;
int toggle = 0; int toggle = 0;
if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
@ -1451,6 +1603,20 @@ mdb_env_share_locks(MDB_env *env)
env->me_txns->mti_me_toggle = toggle; env->me_txns->mti_me_toggle = toggle;
env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
#ifdef _WIN32
{
OVERLAPPED ov;
/* First acquire a shared lock. The Unlock will
* then release the existing exclusive lock.
*/
memset(&ov, 0, sizeof(ov));
LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov);
UnlockFile(env->me_lfd, 0, 0, 1, 0);
}
#else
{
struct flock lock_info;
/* The shared lock replaces the existing lock */
memset((void *)&lock_info, 0, sizeof(lock_info)); memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_RDLCK; lock_info.l_type = F_RDLCK;
lock_info.l_whence = SEEK_SET; lock_info.l_whence = SEEK_SET;
@ -1458,23 +1624,50 @@ mdb_env_share_locks(MDB_env *env)
lock_info.l_len = 1; lock_info.l_len = 1;
fcntl(env->me_lfd, F_SETLK, &lock_info); fcntl(env->me_lfd, F_SETLK, &lock_info);
} }
#endif
}
static int static int
mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
{ {
int rc; int rc;
off_t size, rsize; off_t size, rsize;
struct flock lock_info;
*excl = 0; *excl = 0;
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) { #ifdef _WIN32
rc = errno; if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
rc = ErrCode();
return rc; return rc;
} }
/* Try to get exclusive lock. If we succeed, then /* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it. * nobody is using the lock region and we should initialize it.
*/ */
{
if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
*excl = 1;
} else {
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
rc = ErrCode();
goto fail;
}
}
}
size = GetFileSize(env->me_lfd, NULL);
#else
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
rc = ErrCode();
return rc;
}
/* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it.
*/
{
struct flock lock_info;
memset((void *)&lock_info, 0, sizeof(lock_info)); memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_WRLCK; lock_info.l_type = F_WRLCK;
lock_info.l_whence = SEEK_SET; lock_info.l_whence = SEEK_SET;
@ -1485,31 +1678,92 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
*excl = 1; *excl = 1;
} else { } else {
lock_info.l_type = F_RDLCK; lock_info.l_type = F_RDLCK;
rc = fcntl(env->me_lfd, F_SETLK, &lock_info); rc = fcntl(env->me_lfd, F_SETLKW, &lock_info);
if (rc) { if (rc) {
rc = errno; rc = ErrCode();
goto fail; goto fail;
} }
} }
}
size = lseek(env->me_lfd, 0, SEEK_END); size = lseek(env->me_lfd, 0, SEEK_END);
#endif
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
if (size < rsize && *excl) { if (size < rsize && *excl) {
if (ftruncate(env->me_lfd, rsize) != 0) { #ifdef _WIN32
rc = errno; SetFilePointer(env->me_lfd, rsize, NULL, 0);
if (!SetEndOfFile(env->me_lfd)) {
rc = ErrCode();
goto fail; goto fail;
} }
#else
if (ftruncate(env->me_lfd, rsize) != 0) {
rc = ErrCode();
goto fail;
}
#endif
} else { } else {
rsize = size; rsize = size;
size = rsize - sizeof(MDB_txninfo); size = rsize - sizeof(MDB_txninfo);
env->me_maxreaders = size/sizeof(MDB_reader) + 1; env->me_maxreaders = size/sizeof(MDB_reader) + 1;
} }
#ifdef _WIN32
{
HANDLE mh;
mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
0, 0, NULL);
if (!mh) {
rc = ErrCode();
goto fail;
}
env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
CloseHandle(mh);
if (!env->me_txns) {
rc = ErrCode();
goto fail;
}
}
#else
env->me_txns = mmap(0, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_txns = mmap(0, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
env->me_lfd, 0); env->me_lfd, 0);
if (env->me_txns == MAP_FAILED) { if (env->me_txns == MAP_FAILED) {
rc = errno; rc = ErrCode();
goto fail; goto fail;
} }
#endif
if (*excl) { if (*excl) {
#ifdef _WIN32
char *ptr;
if (!mdb_sec_inited) {
InitializeSecurityDescriptor(&mdb_null_sd,
SECURITY_DESCRIPTOR_REVISION);
SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
mdb_all_sa.bInheritHandle = FALSE;
mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
mdb_sec_inited = 1;
}
/* FIXME: only using up to 20 characters of the env path here,
* probably not enough to assure uniqueness...
*/
sprintf(env->me_txns->mti_rmname, "Global\\MDBr%.20s", lpath);
ptr = env->me_txns->mti_rmname + sizeof("Global\\MDBr");
while ((ptr = strchr(ptr, '\\')))
*ptr++ = '/';
env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
if (!env->me_rmutex) {
rc = ErrCode();
goto fail;
}
sprintf(env->me_txns->mti_rmname, "Global\\MDBw%.20s", lpath);
ptr = env->me_txns->mti_rmname + sizeof("Global\\MDBw");
while ((ptr = strchr(ptr, '\\')))
*ptr++ = '/';
env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
if (!env->me_wmutex) {
rc = ErrCode();
goto fail;
}
#else
pthread_mutexattr_t mattr; pthread_mutexattr_t mattr;
pthread_mutexattr_init(&mattr); pthread_mutexattr_init(&mattr);
@ -1519,6 +1773,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
} }
pthread_mutex_init(&env->me_txns->mti_mutex, &mattr); pthread_mutex_init(&env->me_txns->mti_mutex, &mattr);
pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr);
#endif
env->me_txns->mti_version = MDB_VERSION; env->me_txns->mti_version = MDB_VERSION;
env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_magic = MDB_MAGIC;
env->me_txns->mti_txnid = 0; env->me_txns->mti_txnid = 0;
@ -1537,16 +1792,28 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
rc = MDB_VERSION_MISMATCH; rc = MDB_VERSION_MISMATCH;
goto fail; goto fail;
} }
if (errno != EACCES && errno != EAGAIN) { rc = ErrCode();
rc = errno; if (rc != EACCES && rc != EAGAIN) {
goto fail; goto fail;
} }
#ifdef _WIN32
env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
if (!env->me_rmutex) {
rc = ErrCode();
goto fail;
}
env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
if (!env->me_wmutex) {
rc = ErrCode();
goto fail;
}
#endif
} }
return MDB_SUCCESS; return MDB_SUCCESS;
fail: fail:
close(env->me_lfd); close(env->me_lfd);
env->me_lfd = -1; env->me_lfd = INVALID_HANDLE_VALUE;
return rc; return rc;
} }
@ -1571,25 +1838,50 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
if (rc) if (rc)
goto leave; goto leave;
#ifdef _WIN32
if (F_ISSET(flags, MDB_RDONLY)) {
oflags = GENERIC_READ;
len = OPEN_EXISTING;
} else {
oflags = GENERIC_READ|GENERIC_WRITE;
len = OPEN_ALWAYS;
}
mode = FILE_ATTRIBUTE_NORMAL;
if ((env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
rc = ErrCode();
goto leave;
}
#else
if (F_ISSET(flags, MDB_RDONLY)) if (F_ISSET(flags, MDB_RDONLY))
oflags = O_RDONLY; oflags = O_RDONLY;
else else
oflags = O_RDWR | O_CREAT; oflags = O_RDWR | O_CREAT;
if ((env->me_fd = open(dpath, oflags, mode)) == -1) { if ((env->me_fd = open(dpath, oflags, mode)) == -1) {
rc = errno; rc = ErrCode();
goto leave; goto leave;
} }
#endif
if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) { if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) {
/* synchronous fd for meta writes */ /* synchronous fd for meta writes */
#ifdef _WIN32
if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
mode |= FILE_FLAG_WRITE_THROUGH;
if ((env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
rc = ErrCode();
goto leave;
}
#else
if (!(flags & (MDB_RDONLY|MDB_NOSYNC))) if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
oflags |= MDB_DSYNC; oflags |= MDB_DSYNC;
if ((env->me_mfd = open(dpath, oflags, mode)) == -1) { if ((env->me_mfd = open(dpath, oflags, mode)) == -1) {
rc = errno; rc = ErrCode();
goto leave; goto leave;
} }
#endif
env->me_path = strdup(path); env->me_path = strdup(path);
DPRINTF("opened dbenv %p", (void *) env); DPRINTF("opened dbenv %p", (void *) env);
pthread_key_create(&env->me_txkey, mdb_env_reader_dest); pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
@ -1604,13 +1896,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
leave: leave:
if (rc) { if (rc) {
if (env->me_fd >= 0) { if (env->me_fd != INVALID_HANDLE_VALUE) {
close(env->me_fd); close(env->me_fd);
env->me_fd = -1; env->me_fd = INVALID_HANDLE_VALUE;
} }
if (env->me_lfd >= 0) { if (env->me_lfd != INVALID_HANDLE_VALUE) {
close(env->me_lfd); close(env->me_lfd);
env->me_lfd = -1; env->me_lfd = INVALID_HANDLE_VALUE;
} }
} }
free(lpath); free(lpath);
@ -1646,12 +1938,11 @@ mdb_env_close(MDB_env *env)
close(env->me_fd); close(env->me_fd);
if (env->me_txns) { if (env->me_txns) {
pid_t pid = getpid(); pid_t pid = getpid();
size_t size = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
unsigned int i; unsigned int i;
for (i=0; i<env->me_txns->mti_numreaders; i++) for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == pid) if (env->me_txns->mti_readers[i].mr_pid == pid)
env->me_txns->mti_readers[i].mr_pid = 0; env->me_txns->mti_readers[i].mr_pid = 0;
munmap(env->me_txns, size); munmap(env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
} }
close(env->me_lfd); close(env->me_lfd);
free(env); free(env);