Allow skipping some items in a multi-key GIN search.

In a multi-key search, i.e. something like "col @> 'foo' AND col @> 'bar'",
as soon as we find the next item that matches the first criterion, we don't
need to check the second criterion for TIDs smaller than the first match. That
saves a lot of effort, especially if one of the terms is rare, while the
second occurs very frequently.

Based on ideas from Alexander Korotkov's fast scan patch.
This commit is contained in:
Heikki Linnakangas 2014-01-29 17:53:39 +02:00
parent 2013e5eef7
commit e20c70cb0f

View File

@ -67,29 +67,6 @@ callConsistentFn(GinState *ginstate, GinScanKey key)
PointerGetDatum(key->queryCategories))); PointerGetDatum(key->queryCategories)));
} }
/*
* Tries to refind previously taken ItemPointer on a posting page.
*/
static bool
needToStepRight(Page page, ItemPointer item)
{
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
/* page was deleted by concurrent vacuum */
return true;
if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0
&& !GinPageRightMost(page))
{
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
return true;
}
return false;
}
/* /*
* Goes to the next page if current offset is outside of bounds * Goes to the next page if current offset is outside of bounds
*/ */
@ -447,8 +424,7 @@ restartScanEntry:
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
/* /*
* Copy page content to memory to avoid keeping it locked for * Load the first page into memory.
* a long time.
*/ */
entry->list = GinDataLeafPageGetItems(page, &entry->nlist); entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
@ -518,88 +494,78 @@ startScan(IndexScanDesc scan)
} }
/* /*
* Gets next ItemPointer from PostingTree. Note, that we copy * Load the next batch of item pointers from a posting tree.
* page into GinScanEntry->list array and unlock page, but keep it pinned *
* to prevent interference with vacuum * Note that we copy the page into GinScanEntry->list array and unlock it, but
* keep it pinned to prevent interference with vacuum.
*/ */
static void static void
entryGetNextItem(GinState *ginstate, GinScanEntry entry) entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
{ {
Page page; Page page;
int i; int i;
for (;;)
{
if (entry->offset < entry->nlist)
{
entry->curItem = entry->list[entry->offset++];
return;
}
LockBuffer(entry->buffer, GIN_SHARE); LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
for (;;) for (;;)
{ {
/*
* It's needed to go by right link. During that we should refind
* first ItemPointer greater that stored
*/
if (GinPageRightMost(page))
{
UnlockReleaseBuffer(entry->buffer);
ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber; entry->offset = InvalidOffsetNumber;
if (entry->list) if (entry->list)
{ {
pfree(entry->list); pfree(entry->list);
entry->list = NULL; entry->list = NULL;
entry->nlist = 0;
} }
/* /*
* If the page was concurrently split, we have to re-find the * We've processed all the entries on this page. If it was the last
* item we were stopped on. If the page was split more than once, * page in the tree, we're done.
* the item might not be on this page, but somewhere to the right. */
* Keep following the right-links until we re-find the correct if (GinPageRightMost(page))
{
UnlockReleaseBuffer(entry->buffer);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
continue; /* page was deleted by concurrent vacuum */
/*
* Step to next page, following the right link, then find the first
* ItemPointer greater than advancePast.
*/
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
page = BufferGetPage(entry->buffer);
/*
* The first item > advancePast might not be on this page, but
* somewhere to the right, if the page was split, or a non-match from
* another key in the query allowed us to skip some items from this
* entry. Keep following the right-links until we re-find the correct
* page. * page.
*/ */
if (ItemPointerIsValid(&entry->curItem) && if (!GinPageRightMost(page) &&
needToStepRight(page, &entry->curItem)) ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
{ {
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
continue; continue;
} }
entry->list = GinDataLeafPageGetItems(page, &entry->nlist); entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
/* re-find the item we were stopped on. */
if (ItemPointerIsValid(&entry->curItem))
{
for (i = 0; i < entry->nlist; i++) for (i = 0; i < entry->nlist; i++)
{ {
if (ginCompareItemPointers(&entry->curItem, if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
&entry->list[i]) < 0)
{ {
LockBuffer(entry->buffer, GIN_UNLOCK); LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = i + 1; entry->offset = i;
entry->curItem = entry->list[entry->offset - 1];
return;
}
}
}
else
{
LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = 1; /* scan all items on the page. */
entry->curItem = entry->list[entry->offset - 1];
return; return;
} }
} }
@ -610,10 +576,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/* /*
* Sets entry->curItem to next heap item pointer for one entry of one scan key, * Sets entry->curItem to next heap item pointer > advancePast, for one entry
* or sets entry->isFinished to TRUE if there are no more. * of one scan key, or sets entry->isFinished to TRUE if there are no more.
* *
* Item pointers must be returned in ascending order. * Item pointers are returned in ascending order.
* *
* Note: this can return a "lossy page" item pointer, indicating that the * Note: this can return a "lossy page" item pointer, indicating that the
* entry potentially matches all items on that heap page. However, it is * entry potentially matches all items on that heap page. However, it is
@ -623,12 +589,20 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
* current implementation this is guaranteed by the behavior of tidbitmaps. * current implementation this is guaranteed by the behavior of tidbitmaps.
*/ */
static void static void
entryGetItem(GinState *ginstate, GinScanEntry entry) entryGetItem(GinState *ginstate, GinScanEntry entry,
ItemPointerData advancePast)
{ {
Assert(!entry->isFinished); Assert(!entry->isFinished);
Assert(!ItemPointerIsValid(&entry->curItem) ||
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
if (entry->matchBitmap) if (entry->matchBitmap)
{ {
/* A bitmap result */
BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
do do
{ {
if (entry->matchResult == NULL || if (entry->matchResult == NULL ||
@ -645,6 +619,18 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break; break;
} }
/*
* If all the matches on this page are <= advancePast, skip
* to next page.
*/
if (entry->matchResult->blockno < advancePastBlk ||
(entry->matchResult->blockno == advancePastBlk &&
entry->matchResult->offsets[entry->offset] <= advancePastOff))
{
entry->offset = entry->matchResult->ntuples;
continue;
}
/* /*
* Reset counter to the beginning of entry->matchResult. Note: * Reset counter to the beginning of entry->matchResult. Note:
* entry->offset is still greater than matchResult->ntuples if * entry->offset is still greater than matchResult->ntuples if
@ -670,6 +656,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break; break;
} }
if (entry->matchResult->blockno == advancePastBlk)
{
/*
* Skip to the right offset on this page. We already checked
* in above loop that there is at least one item > advancePast
* on the page.
*/
while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
entry->offset++;
}
ItemPointerSet(&entry->curItem, ItemPointerSet(&entry->curItem,
entry->matchResult->blockno, entry->matchResult->blockno,
entry->matchResult->offsets[entry->offset]); entry->matchResult->offsets[entry->offset]);
@ -678,29 +675,48 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
} }
else if (!BufferIsValid(entry->buffer)) else if (!BufferIsValid(entry->buffer))
{ {
entry->offset++; /* A posting list from an entry tuple */
if (entry->offset <= entry->nlist) do
entry->curItem = entry->list[entry->offset - 1]; {
else if (entry->offset >= entry->nlist)
{ {
ItemPointerSetInvalid(&entry->curItem); ItemPointerSetInvalid(&entry->curItem);
entry->isFinished = TRUE; entry->isFinished = TRUE;
break;
} }
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
/* XXX: shouldn't we apply the fuzzy search limit here? */
} }
else else
{ {
/* A posting tree */
do do
{ {
entryGetNextItem(ginstate, entry); /* If we've processed the current batch, load more items */
} while (entry->isFinished == FALSE && while (entry->offset >= entry->nlist)
entry->reduceResult == TRUE && {
dropItem(entry)); entryLoadMoreItems(ginstate, entry, advancePast);
if (entry->isFinished)
{
ItemPointerSetInvalid(&entry->curItem);
return;
}
}
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
(entry->reduceResult == TRUE && dropItem(entry)));
} }
} }
/* /*
* Identify the "current" item among the input entry streams for this scan key, * Identify the "current" item among the input entry streams for this scan key
* and test whether it passes the scan key qual condition. * that is greater than advancePast, and test whether it passes the scan key
* qual condition.
* *
* The current item is the smallest curItem among the inputs. key->curItem * The current item is the smallest curItem among the inputs. key->curItem
* is set to that value. key->curItemMatches is set to indicate whether that * is set to that value. key->curItemMatches is set to indicate whether that
@ -719,7 +735,8 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
* logic in scanGetItem.) * logic in scanGetItem.)
*/ */
static void static void
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
ItemPointerData advancePast)
{ {
ItemPointerData minItem; ItemPointerData minItem;
ItemPointerData curPageLossy; ItemPointerData curPageLossy;
@ -729,11 +746,20 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
GinScanEntry entry; GinScanEntry entry;
bool res; bool res;
MemoryContext oldCtx; MemoryContext oldCtx;
bool allFinished;
Assert(!key->isFinished); Assert(!key->isFinished);
/* /*
* Find the minimum of the active entry curItems. * We might have already tested this item; if so, no need to repeat work.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
return;
/*
* Find the minimum item > advancePast among the active entry streams.
* *
* Note: a lossy-page entry is encoded by a ItemPointer with max value for * Note: a lossy-page entry is encoded by a ItemPointer with max value for
* offset (0xffff), so that it will sort after any exact entries for the * offset (0xffff), so that it will sort after any exact entries for the
@ -741,16 +767,33 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* pointers, which is good. * pointers, which is good.
*/ */
ItemPointerSetMax(&minItem); ItemPointerSetMax(&minItem);
allFinished = true;
for (i = 0; i < key->nentries; i++) for (i = 0; i < key->nentries; i++)
{ {
entry = key->scanEntry[i]; entry = key->scanEntry[i];
if (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &minItem) < 0) /*
minItem = entry->curItem; * Advance this stream if necessary.
*
* In particular, since entry->curItem was initialized with
* ItemPointerSetMin, this ensures we fetch the first item for each
* entry on the first call.
*/
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
{
entryGetItem(ginstate, entry, advancePast);
} }
if (ItemPointerIsMax(&minItem)) if (!entry->isFinished)
{
allFinished = FALSE;
if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
minItem = entry->curItem;
}
}
if (allFinished)
{ {
/* all entries are finished */ /* all entries are finished */
key->isFinished = TRUE; key->isFinished = TRUE;
@ -758,15 +801,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
} }
/* /*
* We might have already tested this item; if so, no need to repeat work. * OK, set key->curItem and perform consistentFn test.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
return;
/*
* OK, advance key->curItem and perform consistentFn test.
*/ */
key->curItem = minItem; key->curItem = minItem;
@ -895,72 +930,18 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* keyGetItem() the combination logic is known only to the consistentFn. * keyGetItem() the combination logic is known only to the consistentFn.
*/ */
static bool static bool
scanGetItem(IndexScanDesc scan, ItemPointer advancePast, scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
ItemPointerData *item, bool *recheck) ItemPointerData *item, bool *recheck)
{ {
GinScanOpaque so = (GinScanOpaque) scan->opaque; GinScanOpaque so = (GinScanOpaque) scan->opaque;
GinState *ginstate = &so->ginstate;
ItemPointerData myAdvancePast = *advancePast;
uint32 i; uint32 i;
bool allFinished;
bool match; bool match;
for (;;)
{
/*
* Advance any entries that are <= myAdvancePast. In particular,
* since entry->curItem was initialized with ItemPointerSetMin, this
* ensures we fetch the first item for each entry on the first call.
*/
allFinished = TRUE;
for (i = 0; i < so->totalentries; i++)
{
GinScanEntry entry = so->entries[i];
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem,
&myAdvancePast) <= 0)
entryGetItem(ginstate, entry);
if (entry->isFinished == FALSE)
allFinished = FALSE;
}
if (allFinished)
{
/* all entries exhausted, so we're done */
return false;
}
/*
* Perform the consistentFn test for each scan key. If any key
* reports isFinished, meaning its subset of the entries is exhausted,
* we can stop. Otherwise, set *item to the minimum of the key
* curItems.
*/
ItemPointerSetMax(item);
for (i = 0; i < so->nkeys; i++)
{
GinScanKey key = so->keys + i;
keyGetItem(&so->ginstate, so->tempCtx, key);
if (key->isFinished)
return false; /* finished one of keys */
if (ginCompareItemPointers(&key->curItem, item) < 0)
*item = key->curItem;
}
Assert(!ItemPointerIsMax(item));
/*---------- /*----------
* Now *item contains first ItemPointer after previous result. * Advance the scan keys in lock-step, until we find an item that matches
* * all the keys. If any key reports isFinished, meaning its subset of the
* The item is a valid hit only if all the keys succeeded for either * entries is exhausted, we can stop. Otherwise, set *item to the next
* that exact TID, or a lossy reference to the same page. * matching item.
* *
* This logic works only if a keyGetItem stream can never contain both * This logic works only if a keyGetItem stream can never contain both
* exact and lossy pointers for the same page. Else we could have a * exact and lossy pointers for the same page. Else we could have a
@ -977,35 +958,94 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
* (keyGetItem has a similar problem versus entryGetItem.) * (keyGetItem has a similar problem versus entryGetItem.)
*---------- *----------
*/ */
do
{
ItemPointerSetMin(item);
match = true; match = true;
for (i = 0; i < so->nkeys; i++) for (i = 0; i < so->nkeys && match; i++)
{ {
GinScanKey key = so->keys + i; GinScanKey key = so->keys + i;
if (key->curItemMatches) /* Fetch the next item for this key that is > advancePast. */
keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
if (key->isFinished)
return false;
/*
* If it's not a match, we can immediately conclude that nothing
* <= this item matches, without checking the rest of the keys.
*/
if (!key->curItemMatches)
{ {
if (ginCompareItemPointers(item, &key->curItem) == 0) advancePast = key->curItem;
continue;
if (ItemPointerIsLossyPage(&key->curItem) &&
GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item))
continue;
}
match = false; match = false;
break; break;
} }
if (match)
break;
/* /*
* No hit. Update myAdvancePast to this TID, so that on the next pass * It's a match. We can conclude that nothing < matches, so
* we'll move to the next possible entry. * the other key streams can skip to this item.
*
* Beware of lossy pointers, though; from a lossy pointer, we
* can only conclude that nothing smaller than this *block*
* matches.
*/ */
myAdvancePast = *item; if (ItemPointerIsLossyPage(&key->curItem))
{
if (GinItemPointerGetBlockNumber(&advancePast) <
GinItemPointerGetBlockNumber(&key->curItem))
{
advancePast.ip_blkid = key->curItem.ip_blkid;
advancePast.ip_posid = 0;
}
}
else
{
Assert(key->curItem.ip_posid > 0);
advancePast = key->curItem;
advancePast.ip_posid--;
} }
/* /*
* If this is the first key, remember this location as a
* potential match.
*
* Otherwise, check if this is the same item that we checked the
* previous keys for (or a lossy pointer for the same page). If
* not, loop back to check the previous keys for this item (we
* will check this key again too, but keyGetItem returns quickly
* for that)
*/
if (i == 0)
{
*item = key->curItem;
}
else
{
if (ItemPointerIsLossyPage(&key->curItem) ||
ItemPointerIsLossyPage(item))
{
Assert (GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
match = (GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item));
}
else
{
Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
match = (ginCompareItemPointers(&key->curItem, item) == 0);
}
}
}
} while (!match);
Assert(!ItemPointerIsMin(item));
/*
* Now *item contains the first ItemPointer after previous result that
* satisfied all the keys for that exact TID, or a lossy reference
* to the same page.
*
* We must return recheck = true if any of the keys are marked recheck. * We must return recheck = true if any of the keys are marked recheck.
*/ */
*recheck = false; *recheck = false;
@ -1536,7 +1576,7 @@ gingetbitmap(PG_FUNCTION_ARGS)
{ {
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
if (!scanGetItem(scan, &iptr, &iptr, &recheck)) if (!scanGetItem(scan, iptr, &iptr, &recheck))
break; break;
if (ItemPointerIsLossyPage(&iptr)) if (ItemPointerIsLossyPage(&iptr))