From 6214423163cb5c5cc7385e9e2f6da45ea5ee61ef Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 5 Aug 2013 09:55:48 +0200 Subject: [PATCH 01/14] Clarify doc: mdb_copy, nested txns, mdb_drop(). mdb_copy: Does not copy lockfile. Can trigger file growth. mdb_txn_begin(): Clarify usage restrictions. mdb_drop(): State what to do rather than what will be done, since closing the handle could otherwise be read as happening even at failure. --- libraries/liblmdb/lmdb.h | 21 ++++++++++++++------- libraries/liblmdb/mdb_copy.1 | 5 +++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index b3cd5ef79e..9e3e5b71f1 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -138,6 +138,7 @@ extern "C" { #endif +/** Unix permissions for creating files, or dummy definition for Windows */ #ifdef _MSC_VER typedef int mdb_mode_t; #else @@ -534,6 +535,10 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t /** @brief Copy an MDB environment to the specified path. * * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. * @param[in] env An environment handle returned by #mdb_env_create(). It * must have already been opened successfully. * @param[in] path The directory in which the copy will reside. This @@ -546,6 +551,10 @@ int mdb_env_copy(MDB_env *env, const char *path); /** @brief Copy an MDB environment to the specified file descriptor. * * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. 
+ * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. * @param[in] env An environment handle returned by #mdb_env_create(). It * must have already been opened successfully. * @param[in] fd The filedescriptor to write the copy to. It must @@ -718,8 +727,8 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); * @param[in] parent If this parameter is non-NULL, the new transaction * will be a nested transaction, with the transaction indicated by \b parent * as its parent. Transactions may be nested to any level. A parent - * transaction may not issue any other operations besides mdb_txn_begin, - * mdb_txn_abort, or mdb_txn_commit while it has active child transactions. + * transaction and its cursors may not issue any other operations than + * mdb_txn_commit and mdb_txn_abort while it has active child transactions. * @param[in] flags Special options for this transaction. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. @@ -909,14 +918,12 @@ int mdb_dbi_flags(MDB_env *env, MDB_dbi dbi, unsigned int *flags); */ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); - /** @brief Delete a database and/or free all its pages. + /** @brief Empty or delete+close a database. * - * If the \b del parameter is 1, the DB handle will be closed - * and the DB will be deleted. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] del 1 to delete the DB from the environment, - * 0 to just free its pages. + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. * @return A non-zero error value on failure and 0 on success. 
*/ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); diff --git a/libraries/liblmdb/mdb_copy.1 b/libraries/liblmdb/mdb_copy.1 index 7837de5f6b..9b053f97d5 100644 --- a/libraries/liblmdb/mdb_copy.1 +++ b/libraries/liblmdb/mdb_copy.1 @@ -11,6 +11,7 @@ The .B mdb_copy utility copies an LMDB environment. The environment can be copied regardless of whether it is currently in use. +No lockfile is created, since it gets recreated at need. If .I dstpath @@ -22,6 +23,10 @@ written to stdout. Exit status is zero if no errors occur. Errors result in a non-zero exit status and a diagnostic message being written to standard error. +.SH CAVEATS +This utility can trigger significant file size growth if run +in parallel with write transactions, because pages which they +free during copying cannot be reused until the copy is done. .SH "SEE ALSO" .BR mdb_stat (1) .SH AUTHOR From 27435aa5ac14c439ea14c0e7cf0f49c89470bf1e Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 5 Aug 2013 09:55:57 +0200 Subject: [PATCH 02/14] Tweak comments --- libraries/liblmdb/mdb.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8aeaa9dcd4..b37d113fd4 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -827,13 +827,15 @@ struct MDB_txn { /** The list of pages that became unused during this transaction. */ MDB_IDL mt_free_pgs; - /** The list of dirty pages we temporarily wrote to disk + /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. */ MDB_IDL mt_spill_pgs; union { - MDB_ID2L dirty_list; /**< for write txns: modified pages */ - MDB_reader *reader; /**< this thread's reader table slot or NULL */ + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. 
*/ + MDB_reader *reader; } mt_u; /** Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; @@ -1267,7 +1269,7 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) return txn->mt_dbxs[dbi].md_dcmp(a, b); } -/** Allocate a page. +/** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. */ static MDB_page * @@ -1543,9 +1545,14 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) txn->mt_dirty_room--; } -/** Allocate pages for writing. +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. + * * If there are free pages available from older transactions, they - * will be re-used first. Otherwise a new page will be allocated. + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freeDB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. * @param[in] mc cursor A cursor handle identifying the transaction and * database for which we are allocating. * @param[in] num the number of pages to allocate. @@ -1609,7 +1616,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) mdb_cursor_init(&m2, txn, FREE_DBI, NULL); if (last) { op = MDB_SET_RANGE; - key.mv_data = &last; /* will loop up last+1 */ + key.mv_data = &last; /* will look up last+1 */ key.mv_size = sizeof(last); } if (Paranoid && mc->mc_dbi == FREE_DBI) @@ -1985,6 +1992,7 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) } mc = bk; } + /* Only malloced cursors are permanently tracked. */ free(mc); } cursors[i] = NULL; @@ -2304,6 +2312,7 @@ mdb_dbis_update(MDB_txn *txn, int keep) /** Common code for #mdb_txn_reset() and #mdb_txn_abort(). * May be called twice for readonly txns: First reset it, then abort. 
* @param[in] txn the transaction handle to reset + * @param[in] act why the transaction is being reset */ static void mdb_txn_reset0(MDB_txn *txn, const char *act) @@ -3516,7 +3525,7 @@ typedef unsigned long long mdb_hash_t; #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer - * @param[in] str string to hash + * @param[in] val value to hash * @param[in] hval initial value for hash * @return 64 bit hash * From 74ed1757a4fb9263e1a78adc7b60360531824e0a Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 5 Aug 2013 10:01:39 +0200 Subject: [PATCH 03/14] Silence warnings --- libraries/liblmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index b37d113fd4..4b977038b2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2999,7 +2999,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); if (!rc) rc = ErrCode(); - else if (len == psize * 2) + else if ((unsigned) len == psize * 2) rc = MDB_SUCCESS; else rc = ENOSPC; @@ -3560,7 +3560,7 @@ mdb_hash_val(MDB_val *val, mdb_hash_t hval) * @param[in] str string to hash * @param[out] encbuf an array of 11 chars to hold the hash */ -const static char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; +static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; static void mdb_pack85(unsigned long l, char *out) From 636c2d2a294ab4d61df832d864317400a5655d60 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 7 Aug 2013 12:42:46 -0700 Subject: [PATCH 04/14] Use proper printf format on Windows --- libraries/liblmdb/mdb.c | 127 ++++++++++++++++++----------------- libraries/liblmdb/mdb_stat.c | 28 +++++--- 2 files changed, 82 insertions(+), 73 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c 
index 4b977038b2..9a8521c432 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -162,8 +162,11 @@ #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} #define close(fd) (CloseHandle(fd) ? 0 : -1) #define munmap(ptr,len) UnmapViewOfFile(ptr) +#define Z "I" #else +#define Z "z" + #ifdef MDB_USE_POSIX_SEM #define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex) @@ -1166,14 +1169,14 @@ mdb_page_list(MDB_page *mp) DKBUF; nkeys = NUMKEYS(mp); - fprintf(stderr, "Page %zu numkeys %d\n", mp->mp_pgno, nkeys); + fprintf(stderr, "Page %"Z"u numkeys %d\n", mp->mp_pgno, nkeys); for (i=0; i<nkeys; i++) { node = NODEPTR(mp, i); key.mv_size = node->mn_ksize; key.mv_data = node->mn_data; nsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t); if (IS_BRANCH(mp)) { - fprintf(stderr, "key %d: page %zu, %s\n", i, NODEPGNO(node), + fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), DKEY(&key)); } else { if (F_ISSET(node->mn_flags, F_BIGDATA)) @@ -1655,10 +1658,10 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } env->me_pglast = last; #if MDB_DEBUG > 1 - DPRINTF("IDL read txn %zu root %zu num %u", + DPRINTF("IDL read txn %"Z"u root %"Z"u num %u", last, txn->mt_dbs[FREE_DBI].md_root, i); for (k = i; k; k--) - DPRINTF("IDL %zu", idl[k]); + DPRINTF("IDL %"Z"u", idl[k]); #endif /* Merge in descending sorted order */ j = mop_len; @@ -1826,7 +1829,7 @@ mdb_page_touch(MDB_cursor *mc) (rc = mdb_page_alloc(mc, 1, &np))) return rc; pgno = np->mp_pgno; - DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi,mp->mp_pgno,pgno); + DPRINTF("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno); assert(mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ @@ -2180,7 +2183,7 @@ mdb_txn_renew(MDB_txn *txn) rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { - DPRINTF("renew txn %zu%c %p on mdbenv %p, root page %zu", + DPRINTF("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", txn->mt_txnid, (txn->mt_flags & 
MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); } @@ -2275,7 +2278,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) free(txn); else { *ret = txn; - DPRINTF("begin txn %zu%c %p on mdbenv %p, root page %zu", + DPRINTF("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root); } @@ -2322,7 +2325,7 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) /* Close any DBI handles opened in this txn */ mdb_dbis_update(txn, 0); - DPRINTF("%s txn %zu%c %p on mdbenv %p, root page %zu", + DPRINTF("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); @@ -2465,10 +2468,10 @@ mdb_freelist_save(MDB_txn *txn) #if MDB_DEBUG > 1 { unsigned int i = free_pgs[0]; - DPRINTF("IDL write txn %zu root %zu num %u", + DPRINTF("IDL write txn %"Z"u root %"Z"u num %u", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - DPRINTF("IDL %zu", free_pgs[i]); + DPRINTF("IDL %"Z"u", free_pgs[i]); } #endif continue; @@ -2607,7 +2610,7 @@ mdb_page_flush(MDB_txn *txn) * the write offset, to at least save the overhead of a Seek * system call. 
*/ - DPRINTF("committing page %zu", pgno); + DPRINTF("committing page %"Z"u", pgno); memset(&ov, 0, sizeof(ov)); ov.Offset = pos & 0xffffffff; ov.OffsetHigh = pos >> 16 >> 16; @@ -2652,7 +2655,7 @@ mdb_page_flush(MDB_txn *txn) wpos = pos; wsize = 0; } - DPRINTF("committing page %zu", pgno); + DPRINTF("committing page %"Z"u", pgno); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -2830,7 +2833,7 @@ mdb_txn_commit(MDB_txn *txn) if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & MDB_TXN_DIRTY)) goto done; - DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu", + DPRINTF("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); /* Update DB root pointers */ @@ -2925,7 +2928,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) p = (MDB_page *)&pbuf; if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF("page %zu not a meta page", p->mp_pgno); + DPRINTF("page %"Z"u not a meta page", p->mp_pgno); return MDB_INVALID; } @@ -3030,7 +3033,7 @@ mdb_env_write_meta(MDB_txn *txn) assert(txn->mt_env != NULL); toggle = !txn->mt_toggle; - DPRINTF("writing meta page %d for root page %zu", + DPRINTF("writing meta page %d for root page %"Z"u", toggle, txn->mt_dbs[MAIN_DBI].md_root); env = txn->mt_env; @@ -3328,11 +3331,11 @@ mdb_env_open2(MDB_env *env) env->me_metas[0]->mm_version, env->me_psize); DPRINTF("using meta page %d", toggle); DPRINTF("depth: %u", db->md_depth); - DPRINTF("entries: %zu", db->md_entries); - DPRINTF("branch pages: %zu", db->md_branch_pages); - DPRINTF("leaf pages: %zu", db->md_leaf_pages); - DPRINTF("overflow pages: %zu", db->md_overflow_pages); - DPRINTF("root: %zu", db->md_root); + DPRINTF("entries: %"Z"u", db->md_entries); + DPRINTF("branch pages: %"Z"u", db->md_branch_pages); + DPRINTF("leaf pages: %"Z"u", db->md_leaf_pages); + DPRINTF("overflow pages: %"Z"u", db->md_overflow_pages); + DPRINTF("root: %"Z"u", db->md_root); } #endif @@ -4288,7 +4291,7 @@ 
mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) { pgno_t pgno; COPY_PGNO(pgno, mp->mp_pgno); - DPRINTF("searching %u keys in %s %spage %zu", + DPRINTF("searching %u keys in %s %spage %"Z"u", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", pgno); } @@ -4340,7 +4343,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) DPRINTF("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); else - DPRINTF("found branch index %u [%s -> %zu], rc = %i", + DPRINTF("found branch index %u [%s -> %"Z"u], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc); #endif if (rc == 0) @@ -4395,7 +4398,7 @@ mdb_cursor_pop(MDB_cursor *mc) if (mc->mc_snum) mc->mc_top--; - DPRINTF("popped page %zu off db %u cursor %p", top->mp_pgno, + DPRINTF("popped page %"Z"u off db %u cursor %p", top->mp_pgno, mc->mc_dbi, (void *) mc); } } @@ -4404,7 +4407,7 @@ mdb_cursor_pop(MDB_cursor *mc) static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { - DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno, + DPRINTF("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno, mc->mc_dbi, (void *) mc); if (mc->mc_snum >= CURSOR_STACK) { @@ -4467,7 +4470,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) level = 0; p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); } else { - DPRINTF("page %zu not found", pgno); + DPRINTF("page %"Z"u not found", pgno); assert(p != NULL); return MDB_PAGE_NOTFOUND; } @@ -4501,9 +4504,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) MDB_node *node; indx_t i; - DPRINTF("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); + DPRINTF("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)); assert(NUMKEYS(mp) > 1); - DPRINTF("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); + DPRINTF("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))); if (key == NULL) /* Initialize cursor to first page. 
*/ i = 0; @@ -4550,7 +4553,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) return MDB_CORRUPTED; } - DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno, + DPRINTF("found leaf page %"Z"u for key [%s]", mp->mp_pgno, key ? DKEY(key) : NULL); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -4657,7 +4660,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mc->mc_snum = 1; mc->mc_top = 0; - DPRINTF("db %u root page %zu has flags 0x%X", + DPRINTF("db %u root page %"Z"u has flags 0x%X", mc->mc_dbi, root, mc->mc_pg[0]->mp_flags); if (flags & MDB_PS_MODIFY) { @@ -4680,7 +4683,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) MDB_env *env = txn->mt_env; int rc; - DPRINTF("free ov page %zu (%d)", pg, ovpages); + DPRINTF("free ov page %"Z"u (%d)", pg, ovpages); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Not currently supported in nested txns. @@ -4763,7 +4766,7 @@ mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { - DPRINTF("read overflow page %zu failed", pgno); + DPRINTF("read overflow page %"Z"u failed", pgno); return rc; } data->mv_data = METADATA(omp); @@ -4815,7 +4818,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) } mdb_cursor_pop(mc); - DPRINTF("parent page is page %zu, index %u", + DPRINTF("parent page is page %"Z"u, index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if (move_right ? 
(mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) @@ -4880,7 +4883,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc); + DPRINTF("cursor_next: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc); if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { DPUTS("=====> move to next sibling page"); @@ -4889,11 +4892,11 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } mp = mc->mc_pg[mc->mc_top]; - DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); + DPRINTF("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]++; - DPRINTF("==> cursor points to page %zu with %u keys, key index %u", + DPRINTF("==> cursor points to page %"Z"u with %u keys, key index %u", mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { @@ -4950,7 +4953,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc); + DPRINTF("cursor_prev: top page is %"Z"u in cursor %p", mp->mp_pgno, (void *) mc); if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); @@ -4959,13 +4962,13 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); + DPRINTF("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]--; mc->mc_flags &= ~C_EOF; - DPRINTF("==> cursor points to page %zu with %u keys, key index %u", + DPRINTF("==> cursor points to page %"Z"u with %u keys, key index %u", mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { @@ -5491,7 +5494,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, 
MDB_val *data, return EINVAL; #endif - DPRINTF("==> put db %u key [%s], size %zu, data size %zu", + DPRINTF("==> put db %u key [%s], size %"Z"u, data size %"Z"u", mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size); dkey.mv_size = 0; @@ -5988,7 +5991,7 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) if ((rc = mdb_page_alloc(mc, num, &np))) return rc; - DPRINTF("allocated new mpage %zu, page size %u", + DPRINTF("allocated new mpage %"Z"u, page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; np->mp_lower = PAGEHDRSZ; @@ -6087,7 +6090,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, assert(mp->mp_upper >= mp->mp_lower); - DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]", + DPRINTF("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->mv_size : 0, @@ -6121,12 +6124,12 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. 
*/ - DPRINTF("data size is %zu, node would be %zu, put data on overflow page", + DPRINTF("data size is %"Z"u, node would be %"Z"u, put data on overflow page", data->mv_size, node_size+data->mv_size); node_size += sizeof(pgno_t); if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - DPRINTF("allocated overflow page %zu", ofp->mp_pgno); + DPRINTF("allocated overflow page %"Z"u", ofp->mp_pgno); flags |= F_BIGDATA; } else { node_size += data->mv_size; @@ -6135,11 +6138,11 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, node_size += node_size & 1; if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { - DPRINTF("not enough room in page %zu, got %u ptrs", + DPRINTF("not enough room in page %"Z"u, got %u ptrs", mp->mp_pgno, NUMKEYS(mp)); DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower, mp->mp_upper - mp->mp_lower); - DPRINTF("node size = %zu", node_size); + DPRINTF("node size = %"Z"u", node_size); return MDB_PAGE_FULL; } @@ -6208,7 +6211,7 @@ mdb_node_del(MDB_page *mp, indx_t indx, int ksize) { pgno_t pgno; COPY_PGNO(pgno, mp->mp_pgno); - DPRINTF("delete node %u on %s page %zu", indx, + DPRINTF("delete node %u on %s page %"Z"u", indx, IS_LEAF(mp) ? "leaf" : "branch", pgno); } #endif @@ -6370,7 +6373,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi, + DPRINTF("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi, mx->mx_db.md_root); mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ? 
DB_DIRTY : 0); @@ -6537,7 +6540,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) char kbuf2[(MDB_MAXKEYSIZE*2+1)]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu", + DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", indx, ptr, mdb_dkey(&k2, kbuf2), DKEY(key), @@ -6665,7 +6668,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) return rc; } - DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu", + DPRINTF("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), @@ -6716,7 +6719,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF("update separator for source page %zu to [%s]", + DPRINTF("update separator for source page %"Z"u to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); mdb_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -6744,7 +6747,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF("update separator for destination page %zu to [%s]", + DPRINTF("update separator for destination page %"Z"u to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); mdb_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -6782,7 +6785,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) MDB_val key, data; unsigned nkeys; - DPRINTF("merging page %zu into %zu", csrc->mc_pg[csrc->mc_top]->mp_pgno, + DPRINTF("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_pg[cdst->mc_top]->mp_pgno); assert(csrc->mc_snum > 1); /* can't merge root page */ @@ -6835,7 +6838,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } } - DPRINTF("dst page %zu now has %u keys (%.1f%% filled)", + DPRINTF("dst page %"Z"u now has %u keys (%.1f%% filled)", cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), 
(float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10); /* Unlink the src page from parent and add to free list. @@ -6926,7 +6929,7 @@ mdb_rebalance(MDB_cursor *mc) { pgno_t pgno; COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); - DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)", + DPRINTF("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); } @@ -6937,7 +6940,7 @@ mdb_rebalance(MDB_cursor *mc) #if MDB_DEBUG pgno_t pgno; COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); - DPRINTF("no need to rebalance page %zu, above fill threshold", + DPRINTF("no need to rebalance page %"Z"u, above fill threshold", pgno); #endif return MDB_SUCCESS; @@ -7058,7 +7061,7 @@ mdb_rebalance(MDB_cursor *mc) mc->mc_ki[mc->mc_top] = 0; } - DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)", + DPRINTF("found neighbor page %"Z"u (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); /* If the neighbor page is above threshold and has enough keys, @@ -7219,14 +7222,14 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mp = mc->mc_pg[mc->mc_top]; newindx = mc->mc_ki[mc->mc_top]; - DPRINTF("-----> splitting %s page %zu and adding [%s] at index %i", + DPRINTF("-----> splitting %s page %"Z"u and adding [%s] at index %i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top]); /* Create a right sibling. 
*/ if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; - DPRINTF("new right sibling: page %zu", rp->mp_pgno); + DPRINTF("new right sibling: page %"Z"u", rp->mp_pgno); if (mc->mc_snum < 2) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) @@ -7237,7 +7240,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - DPRINTF("root split! new root = %zu", pp->mp_pgno); + DPRINTF("root split! new root = %"Z"u", pp->mp_pgno); mc->mc_db->md_depth++; new_root = 1; @@ -7255,7 +7258,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno ptop = 0; } else { ptop = mc->mc_top-1; - DPRINTF("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); + DPRINTF("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno); } mc->mc_flags |= C_SPLITTING; @@ -8059,9 +8062,9 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) int rc; tid = mr[i].mr_tid; if (mr[i].mr_txnid == (txnid_t)-1) { - sprintf(buf, "%10d %zx -\n", mr[i].mr_pid, tid); + sprintf(buf, "%10d %"Z"x -\n", mr[i].mr_pid, tid); } else { - sprintf(buf, "%10d %zx %zu\n", mr[i].mr_pid, tid, mr[i].mr_txnid); + sprintf(buf, "%10d %"Z"x %"Z"u\n", mr[i].mr_pid, tid, mr[i].mr_txnid); } if (first) { first = 0; diff --git a/libraries/liblmdb/mdb_stat.c b/libraries/liblmdb/mdb_stat.c index aaad2d75a3..aeb573a495 100644 --- a/libraries/liblmdb/mdb_stat.c +++ b/libraries/liblmdb/mdb_stat.c @@ -17,16 +17,22 @@ #include #include "lmdb.h" +#ifdef _WIN32 +#define Z "I" +#else +#define Z "z" +#endif + static void prstat(MDB_stat *ms) { #if 0 printf(" Page size: %u\n", ms->ms_psize); #endif printf(" Tree depth: %u\n", ms->ms_depth); - printf(" Branch pages: %zu\n", ms->ms_branch_pages); - printf(" Leaf pages: %zu\n", ms->ms_leaf_pages); - printf(" Overflow pages: %zu\n", ms->ms_overflow_pages); - printf(" Entries: %zu\n", ms->ms_entries); + printf(" Branch pages: %"Z"u\n", ms->ms_branch_pages); + 
printf(" Leaf pages: %"Z"u\n", ms->ms_leaf_pages); + printf(" Overflow pages: %"Z"u\n", ms->ms_overflow_pages); + printf(" Entries: %"Z"u\n", ms->ms_entries); } static void usage(char *prog) @@ -110,11 +116,11 @@ int main(int argc, char *argv[]) rc = mdb_env_info(env, &mei); printf("Environment Info\n"); printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %zu\n", mei.me_mapsize); + printf(" Map size: %"Z"u\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %zu\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %zu\n", mei.me_last_pgno+1); - printf(" Last transaction ID: %zu\n", mei.me_last_txnid); + printf(" Max pages: %"Z"u\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %"Z"u\n", mei.me_last_pgno+1); + printf(" Last transaction ID: %"Z"u\n", mei.me_last_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } @@ -172,20 +178,20 @@ int main(int argc, char *argv[]) pg += span; for (; i >= span && iptr[i-span] == pg; span++, pg++) ; } - printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", + printf(" Transaction %"Z"u, %"Z"d pages, maxspan %"Z"d%s\n", *(size_t *)key.mv_data, j, span, bad); if (freinfo > 2) { for (--j; j >= 0; ) { pg = iptr[j]; for (span=1; --j >= 0 && iptr[j] == pg+span; span++) ; - printf(span>1 ? " %9zu[%zd]\n" : " %9zu\n", + printf(span>1 ? " %9"Z"u[%"Z"d]\n" : " %9"Z"u\n", pg, span); } } } } mdb_cursor_close(cursor); - printf(" Free pages: %zu\n", pages); + printf(" Free pages: %"Z"u\n", pages); } rc = mdb_open(txn, subname, 0, &dbi); From 1878213092fbc60f42131318f9eb98a73d5e888c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:43:04 +0200 Subject: [PATCH 05/14] Fix mdb_reader_pid(). Treat unexpected errors as "don't know". Invert Pidcheck return value, so nonzero including error codes = "the process may exist". On Windows: Catch exited but still existing processes. 
Handle undefined PROCESS_QUERY_LIMITED_INFORMATION. On Unix: don't trust F_GETLK error to leave the input alone, the fcntl() doc seems unclear. --- libraries/liblmdb/mdb.c | 58 ++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 9a8521c432..1c60e0c04f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -162,6 +162,11 @@ #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} #define close(fd) (CloseHandle(fd) ? 0 : -1) #define munmap(ptr,len) UnmapViewOfFile(ptr) +#ifdef PROCESS_QUERY_LIMITED_INFORMATION +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION +#else +#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 +#endif #define Z "I" #else @@ -2019,7 +2024,8 @@ enum Pidlock_op { #endif /** Set or check a pid lock. Set returns 0 on success. - * Check returns 0 if lock exists (meaning the process is alive). + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). * * On Windows Pidset is a no-op, we merely check for the existence * of the process with the given pid. On POSIX we use a single byte @@ -2029,32 +2035,35 @@ static int mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid) { #ifdef _WIN32 + int ret = 0; HANDLE h; - int ver, query; - switch(op) { - case Pidset: - break; - case Pidcheck: + if (op == Pidcheck) { h = OpenProcess(env->me_pidquery, FALSE, pid); + /* No documented "no such process" code, but other programs use this: */ if (!h) - return GetLastError(); + return ErrCode() != ERROR_INVALID_PARAMETER; + /* A process exists until all handles to it close. Has it exited? 
*/ + ret = WaitForSingleObject(h, 0) != 0; CloseHandle(h); - break; } - return 0; + return ret; #else - int rc; - struct flock lock_info; - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = pid; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, op, &lock_info)) && - (rc = ErrCode()) == EINTR) ; - if (op == F_GETLK && rc == 0 && lock_info.l_type == F_UNLCK) - rc = -1; - return rc; + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) + rc = -1; + } else if ((rc = ErrCode()) == EINTR) { + continue; + } + return rc; + } #endif } @@ -3247,7 +3256,7 @@ mdb_env_open2(MDB_env *env) /* See if we should use QueryLimited */ rc = GetVersion(); if ((rc & 0xff) > 5) - env->me_pidquery = PROCESS_QUERY_LIMITED_INFORMATION; + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; else env->me_pidquery = PROCESS_QUERY_INFORMATION; @@ -8144,9 +8153,10 @@ int mdb_reader_check(MDB_env *env, int *dead) if (mr[i].mr_pid && mr[i].mr_pid != env->me_pid) { pid = mr[i].mr_pid; if (mdb_pid_insert(pids, pid) == 0) { - if (mdb_reader_pid(env, Pidcheck, pid)) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { LOCK_MUTEX_R(env); - if (mdb_reader_pid(env, Pidcheck, pid)) { + /* Recheck, a new process may have reused pid */ + if (!mdb_reader_pid(env, Pidcheck, pid)) { for (j=i; j Date: Thu, 8 Aug 2013 19:43:04 +0200 Subject: [PATCH 06/14] MDB_LOCK_VERSION -> MDB_LOCK_FORMAT. Pid locking needs a different lockfile-version: MDB_env's with and without pid locking must not coexist, they can sabotage each other. Store MDB_LOCK_FORMAT = (version | "use locking" flag) instead. 
--- libraries/liblmdb/mdb.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 1c60e0c04f..345d6d61f9 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -140,6 +140,7 @@ * @{ */ #ifdef _WIN32 +#define MDB_PIDLOCK 0 #define pthread_t DWORD #define pthread_mutex_t HANDLE #define pthread_key_t DWORD @@ -172,6 +173,9 @@ #define Z "z" + /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ +#define MDB_PIDLOCK 1 + #ifdef MDB_USE_POSIX_SEM #define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex) @@ -523,8 +527,8 @@ typedef struct MDB_txbody { /** Stamp identifying this as an MDB file. It must be set * to #MDB_MAGIC. */ uint32_t mtb_magic; - /** Version number of this lock file. Must be set to #MDB_LOCK_VERSION. */ - uint32_t mtb_version; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) char mtb_rmname[MNAME_LEN]; #else @@ -550,7 +554,7 @@ typedef struct MDB_txninfo { union { MDB_txbody mtb; #define mti_magic mt1.mtb.mtb_magic -#define mti_version mt1.mtb.mtb_version +#define mti_format mt1.mtb.mtb_format #define mti_mutex mt1.mtb.mtb_mutex #define mti_rmname mt1.mtb.mtb_rmname #define mti_txnid mt1.mtb.mtb_txnid @@ -569,6 +573,13 @@ typedef struct MDB_txninfo { } mt2; MDB_reader mti_readers[1]; } MDB_txninfo; + + /** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t) \ + ((MDB_LOCK_VERSION) \ + /* Flags which describe functionality */ \ + + (((MDB_PIDLOCK) != 0) << 16))) /** @} */ /** Common header for all page types. 
@@ -2013,7 +2024,7 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) static void mdb_txn_reset0(MDB_txn *txn, const char *act); -#ifdef _WIN32 +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ enum Pidlock_op { Pidset, Pidcheck }; @@ -2034,7 +2045,7 @@ enum Pidlock_op { static int mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid) { -#ifdef _WIN32 +#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ int ret = 0; HANDLE h; if (op == Pidcheck) { @@ -3776,8 +3787,8 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) pthread_mutexattr_destroy(&mattr); #endif /* _WIN32 || MDB_USE_POSIX_SEM */ - env->me_txns->mti_version = MDB_LOCK_VERSION; env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_format = MDB_LOCK_FORMAT; env->me_txns->mti_txnid = 0; env->me_txns->mti_numreaders = 0; @@ -3787,9 +3798,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) rc = MDB_INVALID; goto fail; } - if (env->me_txns->mti_version != MDB_LOCK_VERSION) { - DPRINTF("lock region is version %u, expected version %u", - env->me_txns->mti_version, MDB_LOCK_VERSION); + if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + DPRINTF("lock region has format+version 0x%x, expected 0x%x", + env->me_txns->mti_format, MDB_LOCK_FORMAT); rc = MDB_VERSION_MISMATCH; goto fail; } From ffd2287b5505dd56a1ed12bf5f5fb903ff2fc8ea Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:54:54 +0200 Subject: [PATCH 07/14] Factor out MDB_env. 
--- libraries/liblmdb/mdb.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 345d6d61f9..c2e75e54aa 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4452,12 +4452,11 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) { + MDB_env *env = txn->mt_env; MDB_page *p = NULL; int level; - if (!((txn->mt_flags & MDB_TXN_RDONLY) | - (txn->mt_env->me_flags & MDB_WRITEMAP))) - { + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { MDB_txn *tx2 = txn; level = 1; do { @@ -4471,7 +4470,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) if (tx2->mt_spill_pgs) { x = mdb_midl_search(tx2->mt_spill_pgs, pgno); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pgno) { - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); + p = (MDB_page *)(env->me_map + env->me_psize * pgno); goto done; } } @@ -4488,7 +4487,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) if (pgno < txn->mt_next_pgno) { level = 0; - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); + p = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { DPRINTF("page %"Z"u not found", pgno); assert(p != NULL); From ee5ba855651df54130a53daa8f5b638894343230 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:57:51 +0200 Subject: [PATCH 08/14] Set MDB_TXN_ERROR when inconsistent txn state --- libraries/liblmdb/mdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c2e75e54aa..aeaf4dfd8f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1495,7 +1495,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) continue; } if ((rc = mdb_midl_append(&txn->mt_spill_pgs, dl[i].mid))) - return rc; + goto done; } 
 mdb_midl_sort(txn->mt_spill_pgs); @@ -1503,6 +1503,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) mdb_cursorpages_mark(m0, P_DIRTY|P_KEEP); +done: if (rc == 0) { if (txn->mt_parent) { MDB_txn *tx2; @@ -1525,6 +1526,8 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid; } txn->mt_flags |= MDB_TXN_SPILLS; + } else { + txn->mt_flags |= MDB_TXN_ERROR; } return rc; } From c3547e81f3fcefac2dc66f8dfff64025eb41cc36 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:57:51 +0200 Subject: [PATCH 09/14] Fix page spilling when MDB_WRITEMAP. mdb_page_spill(): Don't binary-search the unsorted dirty_list. mdb_page_flush(): Don't overwrite unprocessed dirty_list items. --- libraries/liblmdb/mdb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index aeaf4dfd8f..46df8005bc 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1434,7 +1434,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) MDB_page *dp; MDB_ID2L dl = txn->mt_u.dirty_list; unsigned int i, j; - int rc; + int rc, level; if (m0->mc_flags & C_SUB) return MDB_SUCCESS; @@ -1461,11 +1461,13 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Mark all the dirty root pages we want to preserve */ for (i=0; i<txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { - j = mdb_mid2l_search(dl, txn->mt_dbs[i].md_root); - if (j <= dl[0].mid) { - dp = dl[j].mptr; + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + goto done; + if ((dp->mp_flags & P_DIRTY) && level <= 1) dp->mp_flags |= P_KEEP; - } } } @@ -2592,7 +2594,7 @@ mdb_page_flush(MDB_txn *txn) j = 0; if (env->me_flags & MDB_WRITEMAP) { /* Clear dirty flags */ - for (i = pagecount; i; i--) { + for (i=1; i<=pagecount; i++) { dp = dl[i].mptr; /* Don't flush this page
yet */ if (dp->mp_flags & P_KEEP) { From 2bd5d8102eddf460e288ccd4bd556ebba133b4b2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:57:52 +0200 Subject: [PATCH 10/14] Fix mdb_ovpage_free() vs. spill. Ensure me_pghead has room before removing from spill/dirty list. Don't return pages to me_pghead in nested txns, use mt_free_pgs. --- libraries/liblmdb/mdb.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 46df8005bc..d1e12554cc 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4703,33 +4703,38 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) { MDB_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; - unsigned i, ovpages = mp->mp_pages; + unsigned x = 0, ovpages = mp->mp_pages; MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; int rc; DPRINTF("free ov page %"Z"u (%d)", pg, ovpages); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. - * Not currently supported in nested txns. * Otherwise put it onto the list of pages we freed in this txn. + * + * Won't create me_pghead: me_pglast must be inited along with it. + * Unsupported in nested txns: They would need to hide the page + * range in ancestor txns' dirty and spilled lists. 
*/ - if (!(mp->mp_flags & P_DIRTY) && txn->mt_spill_pgs) { - unsigned x = mdb_midl_search(txn->mt_spill_pgs, pg); - if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pg) { - /* This page is no longer spilled */ - for (; x < txn->mt_spill_pgs[0]; x++) - txn->mt_spill_pgs[x] = txn->mt_spill_pgs[x+1]; - txn->mt_spill_pgs[0]--; - goto release; - } - } - if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && env->me_pghead) { - unsigned j, x; + if (env->me_pghead && + !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdb_midl_search(sl, pg)) <= sl[0] && sl[x] == pg))) + { + unsigned i, j; pgno_t *mop; MDB_ID2 *dl, ix, iy; rc = mdb_midl_need(&env->me_pghead, ovpages); if (rc) return rc; + if (!(mp->mp_flags & P_DIRTY)) { + /* This page is no longer spilled */ + for (; x < sl[0]; x++) + sl[x] = sl[x+1]; + sl[0]--; + goto release; + } /* Remove from dirty list */ dl = txn->mt_u.dirty_list; x = dl[0].mid--; From 38b11b1de14a16eab461a12fcda9d78f406883a7 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 8 Aug 2013 19:57:52 +0200 Subject: [PATCH 11/14] mdb_cursorpages_mark: Mark current txn and no more. Ignore parent txn cursors since it is the current txn's dirty_list which will be flushed. But check the current txn also when clearing, since cursors can have pages which are dirty in a parent. Check !mc_xcursor instead of !MDB_DUPSORT. Equivalent for valid data, but a bit safer if the sub-DB flags are corrupt. --- libraries/liblmdb/mdb.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d1e12554cc..386a1dcc8c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1352,9 +1352,7 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } -/* Set or clear P_KEEP in non-overflow, non-sub pages in known cursors. - * When clearing, only consider backup cursors (from parent txns) since - * other P_KEEP flags have already been cleared. 
 +/* Set or clear P_KEEP in non-overflow, non-sub pages in this txn's cursors. * @param[in] mc A cursor handle for the current operation. * @param[in] pflags Flags of the pages to update: * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. @@ -1363,7 +1361,7 @@ static void mdb_cursorpages_mark(MDB_cursor *mc, unsigned pflags) { MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; + MDB_cursor *m3; MDB_xcursor *mx; unsigned i, j; @@ -1371,18 +1369,14 @@ mdb_cursorpages_mark(MDB_cursor *mc, unsigned pflags) mc = NULL; /* will find mc in mt_cursors */ for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { for (; mc; mc=mc->mc_next) { - m2 = pflags == P_DIRTY ? mc : mc->mc_backup; - for (; m2; m2 = m2->mc_backup) { - for (m3=m2; m3->mc_flags & C_INITIALIZED; m3=&mx->mx_cursor) { + for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) { for (j=0; j<m3->mc_snum; j++) if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP)) == pflags) m3->mc_pg[j]->mp_flags ^= P_KEEP; - if (!(m3->mc_db->md_flags & MDB_DUPSORT)) + mx = m3->mc_xcursor; + if (mx == NULL) break; - /* Cursor backups have mx malloced at the end of m2 */ - mx = (m3 == mc ? m3->mc_xcursor : (MDB_xcursor *)(m3+1)); - } } } if (i == 0) From 22c104c5d7783c5fda8b354e68302f7f54a3dddf Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 9 Aug 2013 12:54:42 +0200 Subject: [PATCH 12/14] Re-fix reader-pid code --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 386a1dcc8c..f56277ecb1 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -163,7 +163,7 @@ #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} #define close(fd) (CloseHandle(fd) ? 
0 : -1) #define munmap(ptr,len) UnmapViewOfFile(ptr) -#ifndef PROCESS_QUERY_LIMITED_INFORMATION +#ifdef PROCESS_QUERY_LIMITED_INFORMATION #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION #else #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 From 1ecd86b14cf94381bcb96ae6e129d6a5bef7fc9d Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 9 Aug 2013 13:05:14 +0200 Subject: [PATCH 13/14] Replace unpredictable EINVAL error returns. Return EINVAL only for simple programmer errors. --- libraries/liblmdb/lmdb.h | 6 +++++- libraries/liblmdb/mdb.c | 24 ++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 9e3e5b71f1..9019b31bdc 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -387,7 +387,11 @@ typedef enum MDB_cursor_op { #define MDB_INCOMPATIBLE (-30784) /** Invalid reuse of reader locktable slot */ #define MDB_BAD_RSLOT (-30783) -#define MDB_LAST_ERRCODE MDB_BAD_RSLOT + /** Transaction cannot recover - it must be aborted */ +#define MDB_BAD_TXN (-30782) + /** Too big key/data, key is empty, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) +#define MDB_LAST_ERRCODE MDB_BAD_VALSIZE /** @} */ /** @brief Statistics for a database in the environment */ diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index f56277ecb1..0c53766e3f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1127,6 +1127,8 @@ static char *const mdb_errstr[] = { "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", "MDB_INCOMPATIBLE: Database flags changed or would change", "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", + "MDB_BAD_VALSIZE: Too big key/data, key is empty, or wrong DUPFIXED size", }; char * @@ -2729,7 +2731,7 @@ mdb_txn_commit(MDB_txn *txn) DPUTS("error flag is set, can't commit"); if (txn->mt_parent) 
txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = EINVAL; + rc = MDB_BAD_TXN; goto fail; } @@ -4624,7 +4626,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) */ if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { DPUTS("transaction has failed, must abort"); - return EINVAL; + return MDB_BAD_TXN; } else { /* Make sure we're using an up-to-date root */ if (mc->mc_dbi > MAIN_DBI) { @@ -4814,7 +4816,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, return EINVAL; if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { - return EINVAL; + return MDB_BAD_VALSIZE; } mdb_cursor_init(&mc, txn, dbi, &mx); @@ -5344,8 +5346,10 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_SET: case MDB_SET_KEY: case MDB_SET_RANGE: - if (key == NULL || key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { + if (key == NULL) { rc = EINVAL; + } else if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { + rc = MDB_BAD_VALSIZE; } else if (op == MDB_SET_RANGE) rc = mdb_cursor_set(mc, key, data, op, NULL); else @@ -5507,14 +5511,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return EACCES; if (flags != MDB_CURRENT && (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE)) - return EINVAL; + return MDB_BAD_VALSIZE; if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT) && data->mv_size > MDB_MAXKEYSIZE) - return EINVAL; + return MDB_BAD_VALSIZE; #if SIZE_MAX > MAXDATASIZE if (data->mv_size > MAXDATASIZE) - return EINVAL; + return MDB_BAD_VALSIZE; #endif DPRINTF("==> put db %u key [%s], size %"Z"u, data size %"Z"u", @@ -5599,7 +5603,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { unsigned int ksize = mc->mc_db->md_pad; if (key->mv_size != ksize) - return EINVAL; + return MDB_BAD_VALSIZE; if (flags == MDB_CURRENT) { char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->mv_data, ksize); @@ -7182,7 +7186,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, } if (key->mv_size == 0 || 
key->mv_size > MDB_MAXKEYSIZE) { - return EINVAL; + return MDB_BAD_VALSIZE; } mdb_cursor_init(&mc, txn, dbi, &mx); @@ -7646,7 +7650,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, } if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { - return EINVAL; + return MDB_BAD_VALSIZE; } if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) From 1ac3c8308f78269b31de0bd9e6cd99dcda7b48d1 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 9 Aug 2013 04:51:33 -0700 Subject: [PATCH 14/14] Add mdb_env_get_maxkeysize() --- libraries/liblmdb/lmdb.h | 7 +++++++ libraries/liblmdb/mdb.c | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 9019b31bdc..8bd341cca1 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -720,6 +720,13 @@ int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); */ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + /** @brief Get the maximum size of a key for the environment. + * + * @param[in] env An environment handle returned by #mdb_env_create() + * @return The maximum size of a key. (#MDB_MAXKEYSIZE) + */ +int mdb_env_get_maxkeysize(MDB_env *env); + /** @brief Create a transaction for use with the environment. * * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0c53766e3f..ca7233466b 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -8069,6 +8069,11 @@ int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) return MDB_SUCCESS; } +int mdb_env_get_maxkeysize(MDB_env *env) +{ + return MDB_MAXKEYSIZE; +} + int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { unsigned int i, rdrs;