Merge branch 'mdb.master' of ssh://git-master.openldap.org/~git/git/openldap

This commit is contained in:
Howard Chu 2011-09-09 22:29:37 -07:00
commit 8623c98726
2 changed files with 118 additions and 34 deletions

View File

@ -316,8 +316,8 @@ typedef uint16_t indx_t;
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
* when a particular old transaction is no longer in use, Old transactions
* that have freed any data pages can then have their freed pages reclaimed
* when a particular old transaction is no longer in use. Old transactions
* that have discarded any data pages can then have those pages reclaimed
* for use by a later write transaction.
*
* The lock table is constructed such that reader slots are aligned with the
@ -469,10 +469,12 @@ typedef struct MDB_txninfo {
* headers on any page after the first.
*/
typedef struct MDB_page {
union {
pgno_t mp_pgno; /**< page number */
void * mp_next; /**< for in-memory list of freed structs */
};
#define mp_pgno mp_p.p_pgno
#define mp_next mp_p.p_next
union padded {
pgno_t p_pgno; /**< page number */
void * p_next; /**< for in-memory list of freed structs */
} mp_p;
#define P_BRANCH 0x01 /**< branch page */
#define P_LEAF 0x02 /**< leaf page */
#define P_OVERFLOW 0x04 /**< overflow page */
@ -480,13 +482,16 @@ typedef struct MDB_page {
#define P_DIRTY 0x10 /**< dirty page */
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
uint32_t mp_flags;
union {
#define mp_lower mp_pb.pb.pb_lower
#define mp_upper mp_pb.pb.pb_upper
#define mp_pages mp_pb.pb_pages
union page_bounds {
struct {
indx_t mp_lower; /**< lower bound of free space */
indx_t mp_upper; /**< upper bound of free space */
};
uint32_t mp_pages; /**< number of overflow pages */
};
indx_t pb_lower; /**< lower bound of free space */
indx_t pb_upper; /**< upper bound of free space */
} pb;
uint32_t pb_pages; /**< number of overflow pages */
} mp_pb;
indx_t mp_ptrs[1]; /**< dynamic size */
} MDB_page;
@ -543,17 +548,17 @@ typedef struct MDB_node {
/** Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDB_node, mn_data)
/** Size of a node in a branch page.
/** Size of a node in a branch page with a given key.
* This is just the node header plus the key, there is no data.
*/
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
/** Size of a node in a leaf page.
/** Size of a node in a leaf page with a given key and data.
* This is node header plus key plus data size.
*/
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
/** Address of node \i in page \p */
/** Address of node \b i in page \b p */
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
/** Address of the key for the node */
@ -748,6 +753,7 @@ struct MDB_env {
HANDLE me_fd; /**< The main data file */
HANDLE me_lfd; /**< The lock file */
HANDLE me_mfd; /**< just for writing the meta pages */
/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
uint32_t me_flags;
uint32_t me_extrapad; /**< unused for now */
@ -805,8 +811,8 @@ static int mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
static int mdb_rebalance(MDB_cursor *mc);
static int mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
static int mdb_move_node(MDB_cursor *csrcrc, MDB_cursor *cdstst);
static int mdb_merge(MDB_cursor *csrcrc, MDB_cursor *cdstst);
static int mdb_move_node(MDB_cursor *csrc, MDB_cursor *cdst);
static int mdb_merge(MDB_cursor *csrc, MDB_cursor *cdst);
static int mdb_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
pgno_t newpgno);
static MDB_page *mdb_new_page(MDB_cursor *mc, uint32_t flags, int num);
@ -842,6 +848,7 @@ static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;
#endif
/** Return the library version info. */
char *
mdb_version(int *major, int *minor, int *patch)
{
@ -851,7 +858,7 @@ mdb_version(int *major, int *minor, int *patch)
return MDB_VERSION_STRING;
}
/** Table of descriptions for MDB @ref error codes */
/** Table of descriptions for MDB @ref errors */
static char *const mdb_errstr[] = {
"MDB_KEYEXIST: Key/data pair already exists",
"MDB_NOTFOUND: No matching key/data pair found",
@ -874,7 +881,12 @@ mdb_strerror(int err)
}
#if DEBUG
static char *
/** Display a key in hexadecimal and return the address of the result.
* @param[in] key the key to display
* @param[out] buf the buffer to write into. Should always be #DKBUF.
* @return The key in hexadecimal form.
*/
char *
mdb_dkey(MDB_val *key, char *buf)
{
char *ptr = buf;
@ -882,6 +894,9 @@ mdb_dkey(MDB_val *key, char *buf)
unsigned int i;
if (key->mv_size > MAXKEYSIZE)
return "MAXKEYSIZE";
/* may want to make this a dynamic check: if the key is mostly
* printable characters, print it as-is instead of converting to hex.
*/
#if 1
for (i=0; i<key->mv_size; i++)
ptr += sprintf(ptr, "%02x", *c++);
@ -898,6 +913,15 @@ mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
return txn->mt_dbxs[dbi].md_cmp(a, b);
}
/** Compare two data items according to a particular database.
* This returns a comparison as if the two items were data items of
* a sorted duplicates #MDB_DUPSORT database.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_open()
* @param[in] a The first item to compare
* @param[in] b The second item to compare
* @return < 0 if a < b, 0 if a == b, > 0 if a > b
*/
int
mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
{
@ -907,7 +931,15 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
return EINVAL; /* too bad you can't distinguish this from a valid result */
}
/* Allocate new page(s) for writing */
/** Allocate pages for writing.
* If there are free pages available from older transactions, they
* will be re-used first. Otherwise a new page will be allocated.
* @param[in] mc A cursor handle identifying the transaction and
* database for which we are allocating.
* @param[in] num the number of pages to allocate.
* @return Address of the allocated page(s). Requests for multiple pages
* will always be satisfied by a single contiguous chunk of memory.
*/
static MDB_page *
mdb_alloc_page(MDB_cursor *mc, int num)
{
@ -1025,7 +1057,9 @@ mdb_alloc_page(MDB_cursor *mc, int num)
return np;
}
/* Touch a page: make it dirty and re-insert into tree with updated pgno.
/** Touch a page: make it dirty and re-insert into tree with updated pgno.
* @param[in] mc cursor pointing to the page to be touched
* @return 0 on success, non-zero on failure.
*/
static int
mdb_touch(MDB_cursor *mc)
@ -1047,7 +1081,9 @@ mdb_touch(MDB_cursor *mc)
mp->mp_flags |= P_DIRTY;
mc->mc_pg[mc->mc_top] = mp;
/* Update the page number to new touched page. */
/** If this page has a parent, update the parent to point to
* this new page.
*/
if (mc->mc_top)
SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno);
}
@ -1068,6 +1104,12 @@ mdb_env_sync(MDB_env *env, int force)
static inline void
mdb_txn_reset0(MDB_txn *txn);
/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
* @param[in] txn the transaction handle to initialize
* @return 0 on success, non-zero on failure. This can only
* fail for read-only transactions, and then only if the
* reader table is full.
*/
static inline int
mdb_txn_renew0(MDB_txn *txn)
{
@ -1181,6 +1223,9 @@ mdb_txn_begin(MDB_env *env, unsigned int flags, MDB_txn **ret)
return rc;
}
/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
* @param[in] txn the transaction handle to reset
*/
static inline void
mdb_txn_reset0(MDB_txn *txn)
{
@ -1213,6 +1258,7 @@ mdb_txn_reset0(MDB_txn *txn)
env->me_txn = NULL;
for (i=2; i<env->me_numdbs; i++)
env->me_dbxs[i].md_dirty = 0;
/* The writer mutex was locked in mdb_txn_begin. */
UNLOCK_MUTEX_W(env);
}
}
@ -1510,6 +1556,12 @@ done:
return MDB_SUCCESS;
}
/** Read the environment parameters of a DB environment before
* mapping it into memory.
* @param[in] env the environment handle
* @param[out] meta address of where to store the meta information
* @return 0 on success, non-zero on failure.
*/
static int
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
{
@ -1560,6 +1612,11 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
return 0;
}
/** Write the environment parameters of a freshly created DB environment.
* @param[in] env the environment handle
* @param[out] meta address of where to store the meta information
* @return 0 on success, non-zero on failure.
*/
static int
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
{
@ -1610,6 +1667,10 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
return rc;
}
/** Update the environment info to commit a transaction.
* @param[in] txn the transaction that's being committed
* @return 0 on success, non-zero on failure.
*/
static int
mdb_env_write_meta(MDB_txn *txn)
{
@ -1690,6 +1751,11 @@ mdb_env_write_meta(MDB_txn *txn)
return MDB_SUCCESS;
}
/** Check both meta pages to see which one is newer.
* @param[in] env the environment handle
* @param[out] which address of where to store the meta toggle ID
* @return 0 on success, non-zero on failure.
*/
static int
mdb_env_read_meta(MDB_env *env, int *which)
{
@ -1759,6 +1825,8 @@ mdb_env_get_maxreaders(MDB_env *env, int *readers)
return MDB_SUCCESS;
}
/** Further setup required for opening an MDB environment.
*/
static int
mdb_env_open2(MDB_env *env, unsigned int flags)
{
@ -2834,7 +2902,6 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
set1:
if (exactp)
*exactp = 1;
rc = 0;
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
goto set3;
}
@ -3216,9 +3283,11 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
goto top;
} else {
int exact = 0;
rc = mdb_cursor_set(mc, key, NULL, MDB_SET, &exact);
MDB_val d2;
rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
if (flags == MDB_NOOVERWRITE && rc == 0) {
DPRINTF("duplicate key [%s]", DKEY(key));
*data = d2;
return MDB_KEYEXIST;
}
if (rc && rc != MDB_NOTFOUND)
@ -3273,6 +3342,9 @@ top:
rdata = &xdata;
xdata.mv_size = sizeof(MDB_db);
xdata.mv_data = &dummy;
/* new sub-DB, must fully init xcursor */
if (flags == MDB_CURRENT)
flags = 0;
goto new_sub;
}
goto put_sub;
@ -4493,6 +4565,10 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
int
mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
{
/** Only a subset of the @ref mdb_env flags can be changed
* at runtime. Changing other flags requires closing the environment
* and re-opening it with the new flags.
*/
#define CHANGEABLE (MDB_NOSYNC)
if ((flag & CHANGEABLE) != flag)
return EINVAL;

View File

@ -151,7 +151,8 @@ typedef void (MDB_rel_func)(void *newptr, void *oldptr, size_t size);
#define MDB_REVERSEKEY 0x02
/** use sorted duplicates */
#define MDB_DUPSORT 0x04
/** numeric keys in native byte order */
/** numeric keys in native byte order.
* The keys must all be of the same size. */
#define MDB_INTEGERKEY 0x08
/** with #MDB_DUPSORT, sorted dup items have fixed size */
#define MDB_DUPFIXED 0x10
@ -200,7 +201,7 @@ typedef enum MDB_cursor_op {
MDB_PREV_DUP, /**< Position at previous data item of current key.
Only for #MDB_DUPSORT */
MDB_PREV_NODUP, /**< Position at last data item of previous key.
only for #MDB_DUPSORT */
Only for #MDB_DUPSORT */
MDB_SET, /**< Position at specified key */
MDB_SET_RANGE /**< Position at first key greater than or equal to specified key. */
} MDB_cursor_op;
@ -493,7 +494,7 @@ void mdb_txn_abort(MDB_txn *txn);
/** Reset a read-only transaction.
* This releases the current reader lock but doesn't free the
* transaction handle, allowing it to be used again later by #mdb_txn_renew().
* It otherwise has the same affect as #mdb_txn_abort() but saves some memory
* It otherwise has the same effect as #mdb_txn_abort() but saves some memory
* allocation/deallocation overhead if a thread is going to start a new
* read-only transaction again soon.
* All cursors opened within the transaction must be closed before the transaction
@ -539,11 +540,9 @@ int mdb_txn_renew(MDB_txn *txn);
* keys may have multiple data items, stored in sorted order.) By default
* keys must be unique and may have only a single data item.
* <li>#MDB_INTEGERKEY
* Keys are binary integers in native byte order. On Big-Endian systems
* this flag has no effect. On Little-Endian systems this flag behaves
* the same as #MDB_REVERSEKEY. This flag is simply provided as a
* convenience so that applications don't need to detect Endianness themselves
* when using integers as keys.
* Keys are binary integers in native byte order. Setting this option
* requires all keys to be the same size, typically sizeof(int)
* or sizeof(long).
* <li>#MDB_DUPFIXED
* This flag may only be used in combination with #MDB_DUPSORT. This option
* tells the library that the data items for this database are all the same
@ -553,6 +552,9 @@ int mdb_txn_renew(MDB_txn *txn);
* <li>#MDB_INTEGERDUP
* This option specifies that duplicate data items are also integers, and
* should be sorted as such.
* <li>#MDB_REVERSEDUP
* This option specifies that duplicate data items should be compared as
* strings in reverse order.
* <li>#MDB_CREATE
* Create the named database if it doesn't exist. This option is not
* allowed in a read-only transaction or a read-only environment.
@ -655,6 +657,11 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel);
* If the database supports duplicate keys (#MDB_DUPSORT) then the
* first data item for the key will be returned. Retrieval of other
* items requires the use of #mdb_cursor_get().
*
* @note The memory pointed to by the returned values is owned by the
* database. The caller need not dispose of the memory, and may not
* modify it in any way. For values returned in a read-only transaction
* any modification attempts will cause a SIGSEGV.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_open()
* @param[in] key The key to search for in the database
@ -676,7 +683,7 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_open()
* @param[in] key The key to store in the database
* @param[in] data The data to store
* @param[in,out] data The data to store
* @param[in] flags Special options for this operation. This parameter
* must be set to 0 or by bitwise OR'ing together one or more of the
* values described here.
@ -689,7 +696,8 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data);
* <li>#MDB_NOOVERWRITE - enter the new key/data pair only if the key
* does not already appear in the database. The function will return
* #MDB_KEYEXIST if the key already appears in the database, even if
* the database supports duplicates (#MDB_DUPSORT).
* the database supports duplicates (#MDB_DUPSORT). The \b data
* parameter will be set to point to the existing item.
* </ul>
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are: