diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index b5033628348..76bbe01730a 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $ +$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -325,15 +325,26 @@ work sometimes, but could cause failures later on depending on what else gets put on their page. "ScanKey" data structures are used in two fundamentally different ways -in this code. Searches for the initial position for a scan, as well as -insertions, use scankeys in which the comparison function is a 3-way -comparator (<0, =0, >0 result). These scankeys are built within the -btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we -are positioned, sequential examination of tuples in a scan is done by -_bt_checkkeys() using scankeys in which the comparison functions return -booleans --- for example, int4lt might be used. These scankeys are the -ones originally passed in from outside the btree code. Same -representation, but different comparison functions! +in this code, which we describe as "search" scankeys and "insertion" +scankeys. A search scankey is the kind passed to btbeginscan() or +btrescan() from outside the btree code. The sk_func pointers in a search +scankey point to comparison functions that return boolean, such as int4lt. +There might be more than one scankey entry for a given index column, or +none at all. (We require the keys to appear in index column order, but +the order of multiple keys for a given column is unspecified.) An +insertion scankey uses the same array-of-ScanKey data structure, but the +sk_func pointers point to btree comparison support functions (ie, 3-way +comparators that return int4 values interpreted as <0, =0, >0). In an +insertion scankey there is exactly one entry per index column. Insertion +scankeys are built within the btree code (eg, by _bt_mkscankey()) and are +used to locate the starting point of a scan, as well as for locating the +place to insert a new index tuple. (Note: in the case of an insertion +scankey built from a search scankey, there might be fewer keys than +index columns, indicating that we have no constraints for the remaining +index columns.) After we have located the starting point of a scan, the +original search scankey is consulted as each index entry is sequentially +scanned to decide whether to return the entry and whether the scan can +stop (see _bt_checkkeys()). Notes about data representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index f634889219b..c88b6a9fd2b 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.130 2006/01/11 08:43:11 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.131 2006/01/17 00:09:00 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -80,7 +80,7 @@ _bt_doinsert(Relation rel, BTItem btitem, BTStack stack; Buffer buf; - /* we need a scan key to do our search, so build one */ + /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, itup); top: @@ -331,7 +331,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, * If 'afteritem' is >0 then the new tuple must be inserted after the * existing item of that number, noplace else. If 'afteritem' is 0 * then the procedure finds the exact spot to insert it by searching. - * (keysz and scankey parameters are used ONLY if afteritem == 0.) + * (keysz and scankey parameters are used ONLY if afteritem == 0. + * The scankey must be an insertion-type scankey.) * * NOTE: if the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys --- in fact, diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c356dc082f0..9827971603f 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.90 2005/11/22 18:17:06 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.91 2006/01/17 00:09:01 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -813,7 +813,7 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) * better drop the target page lock first. */ _bt_relbuf(rel, buf); - /* we need a scan key to do our search, so build one */ + /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup)); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4cd79997292..904629af30d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.99 2005/12/07 19:37:53 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.100 2006/01/17 00:09:01 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -29,6 +29,9 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * + * The passed scankey must be an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. @@ -127,15 +130,18 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * data that appeared on the page originally is either on the page * or strictly to the right of it. * - * When nextkey is false (the usual case), we are looking for the first - * item >= scankey. When nextkey is true, we are looking for the first - * item strictly greater than scankey. - * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page. If that entry * is strictly less than the scankey, or <= the scankey in the nextkey=true * case, then we followed the wrong link and we need to move right. * + * The passed scankey must be an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * When nextkey is false (the usual case), we are looking for the first + * item >= scankey. When nextkey is true, we are looking for the first + * item strictly greater than scankey. + * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire * the same on the right sibling. Return value is the buffer we stop at. @@ -194,14 +200,13 @@ _bt_moveright(Relation rel, /* * _bt_binsrch() -- Do a binary search for a key on a particular page. * + * The passed scankey must be an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * - * The scankey we get has the compare function stored in the procedure - * entry of each data struct. We invoke this regproc to do the - * comparison for every key in the scankey. - * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the @@ -301,8 +306,11 @@ _bt_binsrch(Relation rel, /*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * + * The passed scankey must be an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * * keysz: number of key conditions to be checked (might be less than the - * total length of the scan key!) + * number of index columns!) * page/offnum: location of btree item to be compared to. * * This routine returns: @@ -464,12 +472,17 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* * _bt_first() -- Find the first item in a scan. * - * We need to be clever about the type of scan, the operation it's - * performing, and the tree ordering. We find the - * first item in the tree that satisfies the qualification - * associated with the scan descriptor. On exit, the page containing + * We need to be clever about the direction of scan, the search + * conditions, and the tree ordering. We find the first item (or, + * if backwards scan, the last item) in the tree that satisfies the + * qualifications in the scan key. On exit, the page containing * the current index tuple is read locked and pinned, and the scan's * opaque data entry is updated to include the buffer. + * + * Note that scan->keyData[], and the so->keyData[] scankey built from it, + * are both search-type scankeys (see nbtree/README for more about this). + * Within this routine, we build a temporary insertion-type scankey to use + * in locating the scan start position. */ bool _bt_first(IndexScanDesc scan, ScanDirection dir) @@ -537,6 +550,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * equality quals survive preprocessing, however, it doesn't matter which * one we use --- by definition, they are either redundant or * contradictory. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. *---------- */ strat_total = BTEqualStrategyNumber; @@ -631,9 +647,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return _bt_endpoint(scan, dir); /* - * We want to start the scan somewhere within the index. Set up a - * 3-way-comparison scankey we can use to search for the boundary point we - * identified above. + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built in the local + * scankeys[] array, using the keys identified by startKeys[]. */ Assert(keysCount <= INDEX_MAX_KEYS); for (i = 0; i < keysCount; i++) @@ -681,19 +698,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } - /* + /*---------- * Examine the selected initial-positioning strategy to determine exactly * where we need to start the scan, and set flag variables to control the * code below. * * If nextkey = false, _bt_search and _bt_binsrch will locate the first - * item >= scan key. If nextkey = true, they will locate the first item > - * scan key. + * item >= scan key. If nextkey = true, they will locate the first + * item > scan key. * - * If goback = true, we will then step back one item, while if goback = - * false, we will start the scan on the located item. + * If goback = true, we will then step back one item, while if + * goback = false, we will start the scan on the located item. * * it's yet other place to add some code later for is(not)null ... + *---------- */ switch (strat_total) { @@ -774,8 +792,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * Use the manufactured scan key to descend the tree and position - * ourselves on the target leaf page. + * Use the manufactured insertion scan key to descend the tree and + * position ourselves on the target leaf page. */ stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 7a18fcf0abb..ddd59444e28 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.67 2005/12/07 19:37:53 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.68 2006/01/17 00:09:01 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,7 +23,7 @@ /* * _bt_mkscankey - * Build a scan key that contains comparison data from itup + * Build an insertion scan key that contains comparison data from itup * as well as comparator routines appropriate to the key datatypes. * * The result is intended for use with _bt_compare(). @@ -67,11 +67,12 @@ _bt_mkscankey(Relation rel, IndexTuple itup) /* * _bt_mkscankey_nodata - * Build a scan key that contains comparator routines appropriate to - * the key datatypes, but no comparison data. The comparison data - * ultimately used must match the key datatypes. + * Build an insertion scan key that contains 3-way comparator routines + * appropriate to the key datatypes, but no comparison data. The + * comparison data ultimately used must match the key datatypes. * - * The result cannot be used with _bt_compare(). Currently this + * The result cannot be used with _bt_compare(), unless comparison + * data is first stored into the key entries. Currently this * routine is only called by nbtsort.c and tuplesort.c, which have * their own comparison routines. */ @@ -160,7 +161,7 @@ _bt_formitem(IndexTuple itup) /*---------- * _bt_preprocess_keys() -- Preprocess scan keys * - * The caller-supplied keys (in scan->keyData[]) are copied to + * The caller-supplied search-type keys (in scan->keyData[]) are copied to * so->keyData[] with possible transformation. scan->numberOfKeys is * the number of input keys, so->numberOfKeys gets the number of output * keys (possibly less, never greater). @@ -485,7 +486,7 @@ _bt_preprocess_keys(IndexScanDesc scan) * accordingly. See comments for _bt_preprocess_keys(), above, about how * this is done. * - * scan: index scan descriptor + * scan: index scan descriptor (containing a search-type scankey) * page: buffer page containing index tuple * offnum: offset number of index tuple (must be a valid item!) * dir: direction we are scanning in