diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index b68daa55ae..76ac0fcddd 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -809,7 +809,8 @@ amrestrpos (IndexScanDesc scan);
Size
-amestimateparallelscan (void);
+amestimateparallelscan (int nkeys,
+ int norderbys);
Estimate and return the number of bytes of dynamic shared memory which
the access method will need to perform a parallel scan. (This number
@@ -817,6 +818,13 @@ amestimateparallelscan (void);
AM-independent data in ParallelIndexScanDescData.)
+
+ The nkeys and norderbys
+ parameters indicate the number of quals and ordering operators that will be
+ used in the scan; the same values will be passed to amrescan.
+ Note that the actual values of the scan keys aren't provided yet.
+
+
It is not necessary to implement this function for access methods which
do not support parallel scans or for which the number of additional bytes
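
For index AM authors, a minimal sketch of the new signature (hypothetical
names: ExampleParallelScanData, exampleestimateparallelscan; the sizing
pattern mirrors nbtree's btestimateparallelscan further below):

    #include "postgres.h"
    #include "storage/shmem.h"      /* add_size(), mul_size() */

    typedef struct ExampleParallelScanData
    {
        int         shared_state;   /* fixed-size AM state (placeholder) */
        int         perkey[FLEXIBLE_ARRAY_MEMBER];  /* one per scan key */
    } ExampleParallelScanData;

    static Size
    exampleestimateparallelscan(int nkeys, int norderbys)
    {
        /* Only the counts are known here; scan key values aren't set yet */
        return add_size(offsetof(ExampleParallelScanData, perkey),
                        mul_size(sizeof(int), nkeys));
    }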
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index e1e96ba7c4..053da8d6e4 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -4064,6 +4064,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage
+
+
+ Queries that use certain SQL constructs to search for
+ rows matching any value out of a list or array of multiple scalar values
+ (see ) perform multiple
+ primitive index scans (up to one primitive scan per scalar
+ value) during query execution. Each internal primitive index scan
+ increments pg_stat_all_indexes.idx_scan,
+ so it's possible for the count of index scans to significantly exceed the
+ total number of index scan executor node executions.
+
+
+
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 78ac3b1abb..7510159fc8 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -449,13 +449,10 @@ index_restrpos(IndexScanDesc scan)
/*
* index_parallelscan_estimate - estimate shared memory for parallel scan
- *
- * Currently, we don't pass any information to the AM-specific estimator,
- * so it can probably only return a constant. In the future, we might need
- * to pass more information.
*/
Size
-index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys,
+ Snapshot snapshot)
{
Size nbytes;
@@ -474,7 +471,8 @@ index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
*/
if (indexRelation->rd_indam->amestimateparallelscan != NULL)
nbytes = add_size(nbytes,
- indexRelation->rd_indam->amestimateparallelscan());
+ indexRelation->rd_indam->amestimateparallelscan(nkeys,
+ norderbys));
return nbytes;
}
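
On the caller side, the executor can pass the counts straight from the scan
node state; a sketch (example_index_scan_estimate is a made-up wrapper, but
the IndexScanState fields are the ones nodeIndexscan.c tracks):

    /* sketch: sizing the DSM chunk for a parallel index scan node */
    static Size
    example_index_scan_estimate(IndexScanState *node, EState *estate)
    {
        return index_parallelscan_estimate(node->iss_RelationDesc,
                                           node->iss_NumScanKeys,
                                           node->iss_NumOrderByKeys,
                                           estate->es_snapshot);
    }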
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 41df1027d2..686a3206f7 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -40,6 +40,9 @@
/*
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
*
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
* a new page; others must wait.
*
@@ -47,11 +50,11 @@
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
*/
typedef enum
{
BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_NEED_PRIMSCAN,
BTPARALLEL_ADVANCING,
BTPARALLEL_IDLE,
BTPARALLEL_DONE,
@@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
- int btps_arrayKeyCount; /* count indicating number of array scan
- * keys processed by parallel scan */
- slock_t btps_mutex; /* protects above variables */
+ slock_t btps_mutex; /* protects above variables, btps_arrElems */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
+
+ /*
+ * btps_arrElems is used when scans need to schedule another primitive
+ * index scan. Holds the BTArrayKeyInfo.cur_elem offset of each array
+ * scan key.
+ */
+ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;
typedef struct BTParallelScanDescData *BTParallelScanDesc;
@@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* btree indexes are never lossy */
scan->xs_recheck = false;
- /*
- * If we have any array keys, initialize them during first call for a
- * scan. We can't do this in btrescan because we don't know the scan
- * direction at that time.
- */
- if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return false;
-
- _bt_start_array_keys(scan, dir);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/*
@@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* If we have a tuple, return it ... */
if (res)
break;
- /* ... otherwise see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+ /* ... otherwise see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
return res;
}
@@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
int64 ntids = 0;
ItemPointer heapTid;
- /*
- * If we have any array keys, initialize them.
- */
- if (so->numArrayKeys)
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return ntids;
-
- _bt_start_array_keys(scan, ForwardScanDirection);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/* Fetch the first page & tuple */
@@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
ntids++;
}
}
- /* Now see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+ /* Now see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
return ntids;
}
@@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
else
so->keyData = NULL;
- so->arrayKeyData = NULL; /* assume no array keys for now */
- so->arraysStarted = false;
- so->numArrayKeys = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
so->arrayKeys = NULL;
+ so->orderProcs = NULL;
so->arrayContext = NULL;
so->killedItems = NULL; /* until needed */
@@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}
so->markItemIndex = -1;
- so->arrayKeyCount = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
@@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
-
- /* If any keys are SK_SEARCHARRAY type, set up array-key info */
- _bt_preprocess_array_keys(scan);
+ so->numArrayKeys = 0; /* ditto */
}
/*
@@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan)
/* Release storage */
if (so->keyData != NULL)
pfree(so->keyData);
- /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ /* so->arrayKeys and so->orderProcs are in arrayContext */
if (so->arrayContext != NULL)
MemoryContextDelete(so->arrayContext);
if (so->killedItems != NULL)
@@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan)
BTScanPosInvalidate(so->markPos);
so->markItemIndex = -1;
}
-
- /* Also record the current positions of any array keys */
- if (so->numArrayKeys)
- _bt_mark_array_keys(scan);
}
/*
@@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* Restore the marked positions of any array keys */
- if (so->numArrayKeys)
- _bt_restore_array_keys(scan);
-
if (so->markItemIndex >= 0)
{
/*
@@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan)
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
+ /* Reset the scan's array keys (see _bt_steppage for why) */
+ if (so->numArrayKeys)
+ {
+ _bt_start_array_keys(scan, so->currPos.dir);
+ so->needPrimScan = false;
+ }
}
else
BTScanPosInvalidate(so->currPos);
@@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan)
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
*/
Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
{
- return sizeof(BTParallelScanDescData);
+ /* Pessimistically assume all input scankeys will be output with arrays */
+ return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
/*
@@ -572,7 +551,6 @@ btinitparallelscan(void *target)
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- bt_target->btps_arrayKeyCount = 0;
ConditionVariableInit(&bt_target->btps_cv);
}
@@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan)
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount = 0;
SpinLockRelease(&btscan->btps_mutex);
}
@@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan)
* or _bt_parallel_done().
*
* The return value is true if we successfully seized the scan and false
- * if we did not. The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not. The latter case occurs if no pages remain.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (in the latter case we'll set so->needPrimScan).
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false). We just return false
+ * for first=false callers that require another primitive index scan.
*/
bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BTPS_State pageStatus;
bool exit_loop = false;
bool status = true;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
*pageno = P_NONE;
+ if (first)
+ {
+ /*
+ * Initialize array-related state when called from _bt_first, assuming
+ * that this will either be the first primitive index scan for the
+ * scan, or a previously scheduled primitive index scan.
+ *
+ * Note: so->needPrimScan is only set when a scheduled primitive index
+ * scan is set to be performed in caller's worker process. It should
+ * not be set here by us for the first primitive scan, nor should we
+ * ever set it for a parallel scan that has no array keys.
+ */
+ so->needPrimScan = false;
+ so->scanBehind = false;
+ }
+ else
+ {
+ /*
+ * Don't attempt to seize the scan when our backend requires another
+ * primitive index scan; only a caller passing first=true can start one
+ */
+ if (so->needPrimScan)
+ return false;
+ }
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
while (1)
{
SpinLockAcquire(&btscan->btps_mutex);
- pageStatus = btscan->btps_pageStatus;
- if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
- /* Parallel scan has already advanced to a new set of scankeys. */
+ /* We're done with this parallel index scan */
status = false;
}
- else if (pageStatus == BTPARALLEL_DONE)
+ else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
{
+ Assert(so->numArrayKeys);
+
/*
- * We're done with this set of scankeys. This may be the end, or
- * there could be more sets to try.
+ * If we can start another primitive scan right away, do so.
+ * Otherwise just wait.
*/
- status = false;
+ if (first)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+ ScanKey skey = &so->keyData[array->scan_key];
+
+ array->cur_elem = btscan->btps_arrElems[i];
+ skey->sk_argument = array->elem_values[array->cur_elem];
+ }
+ so->needPrimScan = true;
+ so->scanBehind = false;
+ *pageno = InvalidBlockNumber;
+ exit_loop = true;
+ }
}
- else if (pageStatus != BTPARALLEL_ADVANCING)
+ else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
{
/*
* We have successfully seized control of the scan for the purpose
@@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required. If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
@@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
void
_bt_parallel_done(IndexScanDesc scan)
{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
bool status_changed = false;
@@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
parallel_scan->ps_offset);
/*
- * Mark the parallel scan as done for this combination of scan keys,
- * unless some other process already did so. See also
- * _bt_advance_array_keys.
+ * Mark the parallel scan as done, unless some other process did so
+ * already
*/
SpinLockAcquire(&btscan->btps_mutex);
- if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
- btscan->btps_pageStatus != BTPARALLEL_DONE)
+ if (btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
status_changed = true;
@@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan)
}
/*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- * keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend. Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
*/
void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
+ Assert(so->numArrayKeys);
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
- so->arrayKeyCount++;
SpinLockAcquire(&btscan->btps_mutex);
- if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ if (btscan->btps_scanPage == prev_scan_page &&
+ btscan->btps_pageStatus == BTPARALLEL_IDLE)
{
btscan->btps_scanPage = InvalidBlockNumber;
- btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount++;
+ btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+ /* Serialize scan's current array keys */
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+ btscan->btps_arrElems[i] = array->cur_elem;
+ }
}
SpinLockRelease(&btscan->btps_mutex);
}
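
Taken together, the calling protocol for a backend whose primitive index scan
ends before the top-level scan does is roughly as follows (a sketch, not an
actual call site; next_page stands in for the scan_page value discussed in
the _bt_parallel_release comments above):

    BlockNumber next_page;

    /* while reading a page (compare _bt_readpage) */
    next_page = opaque->btpo_next;      /* forward scan case */
    if (scan->parallel_scan)
        _bt_parallel_release(scan, next_page);

    /*
     * ... later, if the array keys advance and the scan must be
     * repositioned via another primitive index scan ...
     */
    if (scan->parallel_scan)
        _bt_parallel_primscan_schedule(scan, next_page);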
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index e3fff90d8e..d241e8ea1d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*/
if (!so->qual_ok)
{
- /* Notify any other workers that we're done with this scan key. */
_bt_parallel_done(scan);
return false;
}
@@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* scan has not started, proceed to find out first leaf page in the usual
* way while keeping other participating processes waiting. If the scan
* has already begun, use the page number from the shared structure.
+ *
+ * When a parallel scan has another primitive index scan scheduled, a
+ * parallel worker will seize the scan for that purpose now. This is
+ * similar to the case where the top-level scan hasn't started.
*/
if (scan->parallel_scan != NULL)
{
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, true);
+
+ /*
+ * Initialize arrays (when _bt_parallel_seize didn't already set up
+ * the next primitive index scan)
+ */
+ if (so->numArrayKeys && !so->needPrimScan)
+ _bt_start_array_keys(scan, dir);
+
if (!status)
return false;
else if (blkno == P_NONE)
@@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
goto readcomplete;
}
}
+ else if (so->numArrayKeys && !so->needPrimScan)
+ {
+ /*
+ * First _bt_first call (for current btrescan) without parallelism.
+ *
+ * Initialize arrays, and the corresponding scan keys that were just
+ * output by _bt_preprocess_keys.
+ */
+ _bt_start_array_keys(scan, dir);
+ }
/*----------
* Examine the scan keys to discover where we need to start the scan.
@@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
*
* The selected scan keys (at most one per index column) are remembered by
* storing their addresses into the local startKeys[] array.
+ *
+ * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
+ * the next primitive index scan (for scans with array keys) based in part
+ * on an understanding of how it'll enable us to reposition the scan.
+ * They're directly aware of how we'll sometimes cons up an explicit
+ * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
+ * symmetric "deduce NOT NULL" rule of their own. This allows top-level
+ * scans to skip large groups of NULLs through repeated deductions about
+ * key strictness (for a required inequality key) and whether NULLs in the
+ * key's index column are stored last or first (relative to non-NULLs).
+ * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
+ * need to be kept in sync.
*----------
*/
strat_total = BTEqualStrategyNumber;
@@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
* We scan the current page starting at offnum and moving in the indicated
* direction. All items matching the scan keys are loaded into currPos.items.
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
- * that there can be no more matching tuples in the current scan direction.
+ * that there can be no more matching tuples in the current scan direction
+ * (which could mean just the current primitive index scan, when the scan
+ * has array keys).
*
* _bt_first caller passes us an offnum returned by _bt_binsrch, which might
* be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
@@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int itemIndex;
- bool continuescan;
- int indnatts;
- bool continuescanPrechecked;
- bool haveFirstMatch = false;
+ BTReadPageState pstate;
+ bool arrayKeys;
+ int itemIndex,
+ indnatts;
/*
* We must have the buffer pinned and locked, but the usual macro can't be
@@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
if (scan->parallel_scan)
{
if (ScanDirectionIsForward(dir))
- _bt_parallel_release(scan, opaque->btpo_next);
+ pstate.prev_scan_page = opaque->btpo_next;
else
- _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+ pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+
+ _bt_parallel_release(scan, pstate.prev_scan_page);
}
- continuescan = true; /* default assumption */
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+ arrayKeys = so->numArrayKeys != 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
+ /* initialize page-level state that we'll pass to _bt_checkkeys */
+ pstate.dir = dir;
+ pstate.minoff = minoff;
+ pstate.maxoff = maxoff;
+ pstate.finaltup = NULL;
+ pstate.page = page;
+ pstate.offnum = InvalidOffsetNumber;
+ pstate.skip = InvalidOffsetNumber;
+ pstate.continuescan = true; /* default assumption */
+ pstate.prechecked = false;
+ pstate.firstmatch = false;
+ pstate.rechecks = 0;
+ pstate.targetdistance = 0;
+
/*
* We note the buffer's block number so that we can release the pin later.
* This allows us to re-read the buffer if it is needed again for hinting.
@@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* corresponding value from the last item on the page. So checking with
* the last item on the page would give a more precise answer.
*
- * We skip this for the first page in the scan to evade the possible
- * slowdown of the point queries.
+ * We skip this for the first page read by each (primitive) scan, to avoid
+ * slowing down point queries. They typically don't stand to gain much
+ * when the optimization can be applied, and are more likely to notice the
+ * overhead of the precheck.
+ *
+ * The optimization is unsafe and must be avoided whenever _bt_checkkeys
+ * just set a low-order required array's key to the best available match
+ * for a truncated -inf attribute value from the prior page's high key
+ * (array element 0 is always the best available match in this scenario).
+ * It's quite likely that matches for array element 0 begin on this page,
+ * but the start of matches won't necessarily align with page boundaries.
+ * When the start of matches is somewhere in the middle of this page, it
+ * would be wrong to treat page's final non-pivot tuple as representative.
+ * Doing so might lead us to treat some of the page's earlier tuples as
+ * being part of a group of tuples thought to satisfy the required keys.
+ *
+ * Note: Conversely, in the case where the scan's arrays just advanced
+ * using the prior page's HIKEY _without_ advancement setting scanBehind,
+ * the start of matches must be aligned with page boundaries, which makes
+ * it safe to attempt the optimization here now. It's also safe when the
+ * prior page's HIKEY simply didn't need to advance any required array. In
+ * both cases we can safely assume that the _first_ tuple from this page
+ * must be >= the current set of array keys/equality constraints. And so
+ * if the final tuple is == those same keys (and also satisfies any
+ * required < or <= strategy scan keys) during the precheck, we can safely
+ * assume that this must also be true of all earlier tuples from the page.
*/
- if (!firstPage && minoff < maxoff)
+ if (!firstPage && !so->scanBehind && minoff < maxoff)
{
ItemId iid;
IndexTuple itup;
@@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
itup = (IndexTuple) PageGetItem(page, iid);
- /*
- * Do the precheck. Note that we pass the pointer to the
- * 'continuescanPrechecked' to the 'continuescan' argument. That will
- * set flag to true if all required keys are satisfied and false
- * otherwise.
- */
- (void) _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescanPrechecked, false, false);
- }
- else
- {
- continuescanPrechecked = false;
+ /* Call with arrayKeys=false to avoid undesirable side-effects */
+ _bt_checkkeys(scan, &pstate, false, itup, indnatts);
+ pstate.prechecked = pstate.continuescan;
+ pstate.continuescan = true; /* reset */
}
if (ScanDirectionIsForward(dir))
{
+ /* SK_SEARCHARRAY forward scans must provide high key up front */
+ if (arrayKeys && !P_RIGHTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, P_HIKEY);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in ascending order */
itemIndex = 0;
@@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum < pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
@@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
}
/* When !continuescan, there can't be any more matches, so stop */
- if (!continuescan)
+ if (!pstate.continuescan)
break;
offnum = OffsetNumberNext(offnum);
@@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
- if (continuescan && !P_RIGHTMOST(opaque))
+ if (pstate.continuescan && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
- _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
+ pstate.prechecked = false; /* precheck didn't cover HIKEY */
+ _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
}
- if (!continuescan)
+ if (!pstate.continuescan)
so->currPos.moreRight = false;
Assert(itemIndex <= MaxTIDsPerBTreePage);
@@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
else
{
+ /* SK_SEARCHARRAY backward scans must provide final tuple up front */
+ if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, minoff);
+
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ }
+
/* load items[] in descending order */
itemIndex = MaxTIDsPerBTreePage;
@@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
- passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan,
- continuescanPrechecked,
- haveFirstMatch);
+ pstate.offnum = offnum;
+ passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+ itup, indnatts);
/*
- * If the result of prechecking required keys was true, then in
- * assert-enabled builds we also recheck that the _bt_checkkeys()
- * result is the same.
+ * Check if we need to skip ahead to a later tuple (only possible
+ * when the scan uses array keys)
*/
- Assert((!continuescanPrechecked && !haveFirstMatch) ||
- passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
- &continuescan, false, false));
+ if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ {
+ Assert(!passes_quals && pstate.continuescan);
+ Assert(offnum > pstate.skip);
+
+ offnum = pstate.skip;
+ pstate.skip = InvalidOffsetNumber;
+ continue;
+ }
+
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions */
- haveFirstMatch = true;
+ pstate.firstmatch = true;
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
@@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
}
}
- if (!continuescan)
+ if (!pstate.continuescan)
{
/* there can't be any more matches, so stop */
so->currPos.moreLeft = false;
@@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
so->currPos.nextTupleOffset);
so->markPos.itemIndex = so->markItemIndex;
so->markItemIndex = -1;
+
+ /*
+ * If we're just about to start the next primitive index scan
+ * (possible with a scan that has arrays keys, and needs to skip to
+ * continue in the current scan direction), moreLeft/moreRight only
+ * indicate the end of the current primitive index scan. They must
+ * never be taken to indicate that the top-level index scan has ended
+ * (that would be wrong).
+ *
+ * We could handle this case by treating the current array keys as
+ * markPos state. But depending on the current array state like this
+ * would add complexity. Instead, we just unset markPos's copy of
+ * moreRight or moreLeft (whichever might be affected), while making
+ * btrestpos reset the scan's arrays to their initial scan positions.
+ * In effect, btrestpos leaves advancing the arrays up to the first
+ * _bt_readpage call (that takes place after it has restored markPos).
+ */
+ Assert(so->markPos.dir == dir);
+ if (so->needPrimScan)
+ {
+ if (ScanDirectionIsForward(dir))
+ so->markPos.moreRight = true;
+ else
+ so->markPos.moreLeft = true;
+ }
}
if (ScanDirectionIsForward(dir))
@@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* Seize the scan to get the next block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
/* release the previous buffer, if pinned */
@@ -2013,7 +2130,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
* Seize the scan to get the current block number; if the scan has
* ended already, bail out.
*/
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
BTScanPosUnpinIfPinned(so->currPos);
if (!status)
{
@@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
@@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
if (scan->parallel_scan != NULL)
{
_bt_relbuf(rel, so->currPos.buf);
- status = _bt_parallel_seize(scan, &blkno);
+ status = _bt_parallel_seize(scan, &blkno, false);
if (!status)
{
BTScanPosInvalidate(so->currPos);
@@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Assert(!so->needPrimScan);
+
_bt_initialize_more_data(so, dir);
if (!_bt_readnextpage(scan, blkno, dir))
@@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
}
/*
- * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
- * for scan direction
+ * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
+ * in currPos
*/
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
- /* initialize moreLeft/moreRight appropriately for scan direction */
- if (ScanDirectionIsForward(dir))
+ so->currPos.dir = dir;
+ if (so->needPrimScan)
+ {
+ Assert(so->numArrayKeys);
+
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = true;
+ so->needPrimScan = false;
+ }
+ else if (ScanDirectionIsForward(dir))
{
so->currPos.moreLeft = false;
so->currPos.moreRight = true;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index d50317096d..e963de78a7 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -29,29 +29,77 @@
#include "utils/memutils.h"
#include "utils/rel.h"
+/* Thresholds for the skip-ahead heuristic in _bt_checkkeys_look_ahead */
+#define LOOK_AHEAD_REQUIRED_RECHECKS 3
+#define LOOK_AHEAD_DEFAULT_DISTANCE 5
typedef struct BTSortArrayContext
{
- FmgrInfo flinfo;
+ FmgrInfo *sortproc;
Oid collation;
bool reverse;
} BTSortArrayContext;
+typedef struct BTScanKeyPreproc
+{
+ ScanKey skey;
+ int ikey;
+ int arrayidx;
+} BTScanKeyPreproc;
+
+static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+ FmgrInfo *orderproc, FmgrInfo **sortprocp);
static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
- StrategyNumber strat,
+ Oid elemtype, StrategyNumber strat,
Datum *elems, int nelems);
-static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
- bool reverse,
- Datum *elems, int nelems);
+static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
+ bool reverse, Datum *elems, int nelems);
+static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
+ FmgrInfo *sortproc, bool reverse,
+ Oid origelemtype, Oid nextelemtype,
+ Datum *elems_orig, int *nelems_orig,
+ Datum *elems_next, int nelems_next);
+static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
+ ScanKey arraysk, ScanKey skey,
+ FmgrInfo *orderproc, BTArrayKeyInfo *array,
+ bool *qual_ok);
+static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan);
+static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
+ Datum tupdatum, bool tupnull,
+ Datum arrdatum, ScanKey cur);
+static int _bt_binsrch_array_skey(FmgrInfo *orderproc,
+ bool cur_elem_trig, ScanDirection dir,
+ Datum tupdatum, bool tupnull,
+ BTArrayKeyInfo *array, ScanKey cur,
+ int32 *set_elem_result);
+static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
+static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+ bool readpagetup, int sktrig, bool *scanBehind);
+static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ int sktrig, bool sktrig_required);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
+#endif
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
ScanKey leftarg, ScanKey rightarg,
+ BTArrayKeyInfo *array, FmgrInfo *orderproc,
bool *result);
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ bool advancenonrequired, bool prechecked, bool firstmatch,
+ bool *continuescan, int *ikey);
static bool _bt_check_rowcompare(ScanKey skey,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan);
+static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+ int tupnatts, TupleDesc tupdesc);
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
IndexTuple firstright, BTScanInsert itup_key);
@@ -188,29 +236,55 @@ _bt_freestack(BTStack stack)
*
* If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
* set up BTArrayKeyInfo info for each one that is an equality-type key.
- * Prepare modified scan keys in so->arrayKeyData, which will hold the current
- * array elements during each primitive indexscan operation. For inequality
- * array keys, it's sufficient to find the extreme element value and replace
- * the whole array with that scalar value.
+ * Returns modified scan keys as input for further, standard preprocessing.
*
- * Note: the reason we need so->arrayKeyData, rather than just scribbling
- * on scan->keyData, is that callers are permitted to call btrescan without
- * supplying a new set of scankey data.
+ * Currently we perform two kinds of preprocessing to deal with redundancies.
+ * For inequality array keys, it's sufficient to find the extreme element
+ * value and replace the whole array with that scalar value (for example,
+ * "a < ANY ('{1, 3, 5}')" degenerates to "a < 5"). This eliminates
+ * all but one array element as redundant.
+ * "merging together" multiple equality array keys (from two or more input
+ * scan keys) into a single output scan key containing only the intersecting
+ * array elements. This can eliminate many redundant array elements, as well
+ * as eliminating whole array scan keys as redundant. It can also allow us to
+ * detect contradictory quals.
+ *
+ * It is convenient for the _bt_preprocess_keys caller to never have to
+ * deal with more than one equality strategy array scan key per index
+ * attribute. We'll
+ * always be able to set things up that way when complete opfamilies are used.
+ * Eliminated array scan keys can be recognized as those that have had their
+ * sk_strategy field set to InvalidStrategy here by us. Caller should avoid
+ * including these in the scan's so->keyData[] output array.
+ *
+ * We set the scan key references from the scan's BTArrayKeyInfo info array to
+ * offsets into the temp modified input array returned to caller. Scans that
+ * have array keys should call _bt_preprocess_array_keys_final when standard
+ * preprocessing steps are complete. This will convert the scan key offset
+ * references into references to the scan's so->keyData[] output scan keys.
+ *
+ * Note: the reason we need to return a temp scan key array, rather than just
+ * scribbling on scan->keyData, is that callers are permitted to call btrescan
+ * without supplying a new set of scankey data.
*/
-void
+static ScanKey
_bt_preprocess_array_keys(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
int numberOfKeys = scan->numberOfKeys;
- int16 *indoption = scan->indexRelation->rd_indoption;
+ int16 *indoption = rel->rd_indoption;
int numArrayKeys;
+ int origarrayatt = InvalidAttrNumber,
+ origarraykey = -1;
+ Oid origelemtype = InvalidOid;
ScanKey cur;
- int i;
MemoryContext oldContext;
+ ScanKey arrayKeyData; /* modified copy of scan->keyData */
+
+ Assert(numberOfKeys);
/* Quick check to see if there are any array keys */
numArrayKeys = 0;
- for (i = 0; i < numberOfKeys; i++)
+ for (int i = 0; i < numberOfKeys; i++)
{
cur = &scan->keyData[i];
if (cur->sk_flags & SK_SEARCHARRAY)
@@ -220,20 +294,15 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
/* If any arrays are null as a whole, we can quit right now. */
if (cur->sk_flags & SK_ISNULL)
{
- so->numArrayKeys = -1;
- so->arrayKeyData = NULL;
- return;
+ so->qual_ok = false;
+ return NULL;
}
}
}
/* Quit if nothing to do. */
if (numArrayKeys == 0)
- {
- so->numArrayKeys = 0;
- so->arrayKeyData = NULL;
- return;
- }
+ return NULL;
/*
* Make a scan-lifespan context to hold array-associated data, or reset it
@@ -249,18 +318,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
oldContext = MemoryContextSwitchTo(so->arrayContext);
/* Create modifiable copy of scan->keyData in the workspace context */
- so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
- memcpy(so->arrayKeyData,
- scan->keyData,
- scan->numberOfKeys * sizeof(ScanKeyData));
+ arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
+ memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData));
/* Allocate space for per-array data in the workspace context */
- so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+ /* Allocate space for ORDER procs used to help _bt_checkkeys */
+ so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
/* Now process each array key */
numArrayKeys = 0;
- for (i = 0; i < numberOfKeys; i++)
+ for (int i = 0; i < numberOfKeys; i++)
{
+ FmgrInfo sortproc;
+ FmgrInfo *sortprocp = &sortproc;
+ Oid elemtype;
+ bool reverse;
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
@@ -271,7 +345,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
int num_nonnulls;
int j;
- cur = &so->arrayKeyData[i];
+ cur = &arrayKeyData[i];
if (!(cur->sk_flags & SK_SEARCHARRAY))
continue;
@@ -305,10 +379,21 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
/* If there's no non-nulls, the scan qual is unsatisfiable */
if (num_nonnulls == 0)
{
- numArrayKeys = -1;
+ so->qual_ok = false;
break;
}
+ /*
+ * Determine the nominal datatype of the array elements. We have to
+ * support the convention that sk_subtype == InvalidOid means the
+ * opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ elemtype = cur->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[cur->sk_attno - 1];
+ Assert(elemtype == ARR_ELEMTYPE(arrayval));
+
/*
* If the comparison operator is not equality, then the array qual
* degenerates to a simple comparison against the smallest or largest
@@ -319,7 +404,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
case BTLessStrategyNumber:
case BTLessEqualStrategyNumber:
cur->sk_argument =
- _bt_find_extreme_element(scan, cur,
+ _bt_find_extreme_element(scan, cur, elemtype,
BTGreaterStrategyNumber,
elem_values, num_nonnulls);
continue;
@@ -329,7 +414,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
case BTGreaterEqualStrategyNumber:
case BTGreaterStrategyNumber:
cur->sk_argument =
- _bt_find_extreme_element(scan, cur,
+ _bt_find_extreme_element(scan, cur, elemtype,
BTLessStrategyNumber,
elem_values, num_nonnulls);
continue;
@@ -339,17 +424,93 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
break;
}
+ /*
+ * We'll need a 3-way ORDER proc to perform binary searches for the
+ * next matching array element. Set that up now.
+ *
+ * Array scan keys with cross-type equality operators will require a
+ * separate same-type ORDER proc for sorting their array. Otherwise,
+ * sortproc just points to the same proc used during binary searches.
+ */
+ _bt_setup_array_cmp(scan, cur, elemtype,
+ &so->orderProcs[i], &sortprocp);
+
/*
* Sort the non-null elements and eliminate any duplicates. We must
* sort in the same ordering used by the index column, so that the
- * successive primitive indexscans produce data in index order.
+ * arrays can be advanced in lockstep with the scan's progress through
+ * the index's key space.
*/
- num_elems = _bt_sort_array_elements(scan, cur,
- (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+ reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
+ num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
elem_values, num_nonnulls);
+ if (origarrayatt == cur->sk_attno)
+ {
+ BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
+
+ /*
+ * This array scan key is redundant with a previous equality
+ * operator array scan key. Merge the two arrays together to
+ * eliminate contradictory non-intersecting elements (or try to).
+ *
+ * We merge this next array back into attribute's original array.
+ */
+ Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
+ Assert(arrayKeyData[orig->scan_key].sk_collation ==
+ cur->sk_collation);
+ if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
+ origelemtype, elemtype,
+ orig->elem_values, &orig->num_elems,
+ elem_values, num_elems))
+ {
+ /* Successfully eliminated this array */
+ pfree(elem_values);
+
+ /*
+ * If no intersecting elements remain in the original array,
+ * the scan qual is unsatisfiable
+ */
+ if (orig->num_elems == 0)
+ {
+ so->qual_ok = false;
+ break;
+ }
+
+ /*
+ * Indicate to _bt_preprocess_keys caller that it must ignore
+ * this scan key
+ */
+ cur->sk_strategy = InvalidStrategy;
+ continue;
+ }
+
+ /*
+ * Unable to merge this array with previous array due to a lack of
+ * suitable cross-type opfamily support. Will need to keep both
+ * scan keys/arrays.
+ */
+ }
+ else
+ {
+ /*
+ * This array is the first for current index attribute.
+ *
+ * If it turns out to not be the last array (that is, if the next
+ * array is redundantly applied to this same index attribute),
+ * we'll then treat this array as the attribute's "original" array
+ * when merging.
+ */
+ origarrayatt = cur->sk_attno;
+ origarraykey = numArrayKeys;
+ origelemtype = elemtype;
+ }
+
/*
* And set up the BTArrayKeyInfo data.
+ *
+ * Note: _bt_preprocess_array_keys_final will fix-up each array's
+ * scan_key field later on, after so->keyData[] has been finalized.
*/
so->arrayKeys[numArrayKeys].scan_key = i;
so->arrayKeys[numArrayKeys].num_elems = num_elems;
@@ -360,6 +521,256 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
so->numArrayKeys = numArrayKeys;
MemoryContextSwitchTo(oldContext);
+
+ return arrayKeyData;
+}
+
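For context, the intended consumption of the returned scan keys in
_bt_preprocess_keys (not shown in this excerpt) is roughly the following
sketch, where keyDataMap is the input-to-output mapping mentioned below:

    ScanKey     inkeys;

    /* Transform array keys; may also prove the qual contradictory */
    inkeys = _bt_preprocess_array_keys(scan);
    if (!so->qual_ok)
        return;                 /* unsatisfiable array qual, so give up */
    if (inkeys == NULL)
        inkeys = scan->keyData; /* no array keys to transform */

    /* ... standard preprocessing of inkeys into so->keyData[] ... */

    if (so->numArrayKeys)
        _bt_preprocess_array_keys_final(scan, keyDataMap);
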
+/*
+ * _bt_preprocess_array_keys_final() -- fix up array scan key references
+ *
+ * When _bt_preprocess_array_keys performed initial array preprocessing, it
+ * set each array's array->scan_key to the array's input scan key offset
+ * (which doubles as a reference into the original scan->keyData[] array).
+ * This function translates each such reference in the scan's BTArrayKeyInfo
+ * info array from an input scan key reference (to a key in scan->keyData[])
+ * into an output reference (to a key in so->keyData[]).
+ * Caller's keyDataMap[] array tells us how to perform this remapping.
+ *
+ * Also finalizes so->orderProcs[] for the scan. Arrays already have an ORDER
+ * proc, which might need to be repositioned to its so->keyData[]-wise offset
+ * (very much like the remapping that we apply to array->scan_key references).
+ * Non-array equality strategy scan keys (that survived preprocessing) don't
+ * yet have an so->orderProcs[] entry, so we set one for them here.
+ *
+ * Also converts single-element array scan keys into equivalent non-array
+ * equality scan keys, which decrements so->numArrayKeys. It's possible that
+ * this will leave this new btrescan without any arrays at all. This isn't
+ * necessary for correctness; it's just an optimization. Non-array equality
+ * scan keys are slightly faster than equivalent array scan keys at runtime.
+ */
+static void
+_bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+ int arrayidx = 0;
+ int last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1;
+
+ Assert(so->qual_ok);
+ Assert(so->numArrayKeys);
+
+ for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++)
+ {
+ ScanKey outkey = so->keyData + output_ikey;
+ int input_ikey;
+ bool found PG_USED_FOR_ASSERTS_ONLY = false;
+
+ Assert(outkey->sk_strategy != InvalidStrategy);
+
+ if (outkey->sk_strategy != BTEqualStrategyNumber)
+ continue;
+
+ input_ikey = keyDataMap[output_ikey];
+
+ Assert(last_equal_output_ikey < output_ikey);
+ Assert(last_equal_output_ikey < input_ikey);
+ last_equal_output_ikey = output_ikey;
+
+ /*
+ * We're lazy about looking up ORDER procs for non-array keys, since
+ * not all input keys become output keys. Take care of it now.
+ */
+ if (!(outkey->sk_flags & SK_SEARCHARRAY))
+ {
+ Oid elemtype;
+
+ /* No need for an ORDER proc given an IS NULL scan key */
+ if (outkey->sk_flags & SK_SEARCHNULL)
+ continue;
+
+ /*
+ * A non-required scan key doesn't need an ORDER proc, either
+ * (unless it's associated with an array, which this one isn't)
+ */
+ if (!(outkey->sk_flags & SK_BT_REQFWD))
+ continue;
+
+ elemtype = outkey->sk_subtype;
+ if (elemtype == InvalidOid)
+ elemtype = rel->rd_opcintype[outkey->sk_attno - 1];
+
+ _bt_setup_array_cmp(scan, outkey, elemtype,
+ &so->orderProcs[output_ikey], NULL);
+ continue;
+ }
+
+ /*
+ * Reorder existing array scan key so->orderProcs[] entries.
+ *
+ * Doing this in-place is safe because preprocessing is required to
+ * output all equality strategy scan keys in original input order
+ * (among each group of entries against the same index attribute).
+ * This is also the order that the arrays themselves appear in.
+ */
+ so->orderProcs[output_ikey] = so->orderProcs[input_ikey];
+
+ /* Fix-up array->scan_key references for arrays */
+ for (; arrayidx < so->numArrayKeys; arrayidx++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[arrayidx];
+
+ Assert(array->num_elems > 0);
+
+ if (array->scan_key == input_ikey)
+ {
+ /* found it */
+ array->scan_key = output_ikey;
+ found = true;
+
+ /*
+ * Transform array scan keys that have exactly 1 element
+ * remaining (following all prior preprocessing) into
+ * equivalent non-array scan keys.
+ */
+ if (array->num_elems == 1)
+ {
+ outkey->sk_flags &= ~SK_SEARCHARRAY;
+ outkey->sk_argument = array->elem_values[0];
+ so->numArrayKeys--;
+
+ /* If we're out of array keys, we can quit right away */
+ if (so->numArrayKeys == 0)
+ return;
+
+ /* Shift other arrays forward */
+ memmove(array, array + 1,
+ sizeof(BTArrayKeyInfo) *
+ (so->numArrayKeys - arrayidx));
+
+ /*
+ * Don't increment arrayidx (there was an entry that was
+ * just shifted forward to the offset at arrayidx, which
+ * will still need to be matched)
+ */
+ }
+ else
+ {
+ /* Match found, so done with this array */
+ arrayidx++;
+ }
+
+ break;
+ }
+ }
+
+ Assert(found);
+ }
+
+ /*
+ * Parallel index scans require space in shared memory to store the
+ * current array elements (for arrays kept by preprocessing) to schedule
+ * the next primitive index scan. The underlying structure is protected
+ * using a spinlock, so defensively limit its size. In practice this can
+ * only affect parallel scans that use an incomplete opfamily.
+ */
+ if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)",
+ so->numArrayKeys, INDEX_MAX_KEYS)));
+}
+
+/*
+ * _bt_setup_array_cmp() -- Set up array comparison functions
+ *
+ * Sets ORDER proc in caller's orderproc argument, which is used during binary
+ * searches of arrays during the index scan. Also sets a same-type ORDER proc
+ * in caller's *sortprocp argument, which is used when sorting the array.
+ *
+ * Preprocessing calls here with all equality strategy scan keys (when scan
+ * uses equality array keys), including those not associated with any array.
+ * See _bt_advance_array_keys for an explanation of why it'll need to treat
+ * simple scalar equality scan keys as degenerate single element arrays.
+ *
+ * Caller should pass an orderproc pointing to space that'll store the ORDER
+ * proc for the scan, and a *sortprocp pointing to its own separate space.
+ * When calling here for a non-array scan key, sortprocp arg should be NULL.
+ *
+ * In the common case where we don't need to deal with cross-type operators,
+ * only one ORDER proc is actually required by caller. We'll set *sortprocp
+ * to point to the same memory that caller's orderproc continues to point to.
+ * Otherwise, *sortprocp will continue to point to caller's own space. Either
+ * way, *sortprocp will point to a same-type ORDER proc (since that's the only
+ * safe way to sort/deduplicate the array associated with caller's scan key).
+ */
+static void
+_bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+ FmgrInfo *orderproc, FmgrInfo **sortprocp)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+ RegProcedure cmp_proc;
+ Oid opcintype = rel->rd_opcintype[skey->sk_attno - 1];
+
+ Assert(skey->sk_strategy == BTEqualStrategyNumber);
+ Assert(OidIsValid(elemtype));
+
+ /*
+ * If scankey operator is not a cross-type comparison, we can use the
+ * cached comparison function; otherwise gotta look it up in the catalogs
+ */
+ if (elemtype == opcintype)
+ {
+ /* Set same-type ORDER procs for caller */
+ *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC);
+ if (sortprocp)
+ *sortprocp = orderproc;
+
+ return;
+ }
+
+ /*
+ * Look up the appropriate cross-type comparison function in the opfamily.
+ *
+ * Use the opclass input type as the left hand arg type, and the array
+ * element type as the right hand arg type (since binary searches use an
+ * index tuple's attribute value to search for a matching array element).
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but only in cases where it's quite likely that _bt_first
+ * would fail in just the same way (had we not failed before it could).
+ */
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+ opcintype, elemtype, BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, opcintype, elemtype, skey->sk_attno,
+ RelationGetRelationName(rel));
+
+ /* Set cross-type ORDER proc for caller */
+ fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext);
+
+ /* Done if caller doesn't actually have an array they'll need to sort */
+ if (!sortprocp)
+ return;
+
+ /*
+ * Look up the appropriate same-type comparison function in the opfamily.
+ *
+ * Note: it's possible that this would fail, if the opfamily is
+ * incomplete, but it seems quite unlikely that an opfamily would omit
+ * non-cross-type comparison procs for any datatype that it supports at
+ * all.
+ */
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+ elemtype, elemtype, BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, elemtype, elemtype,
+ skey->sk_attno, RelationGetRelationName(rel));
+
+ /* Set same-type ORDER proc for caller */
+ fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext);
}
/*
@@ -370,27 +781,17 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
* least element, or BTGreaterStrategyNumber to get the greatest.
*/
static Datum
-_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
+_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype,
StrategyNumber strat,
Datum *elems, int nelems)
{
Relation rel = scan->indexRelation;
- Oid elemtype,
- cmp_op;
+ Oid cmp_op;
RegProcedure cmp_proc;
FmgrInfo flinfo;
Datum result;
int i;
- /*
- * Determine the nominal datatype of the array elements. We have to
- * support the convention that sk_subtype == InvalidOid means the opclass
- * input type; this is a hack to simplify life for ScanKeyInit().
- */
- elemtype = skey->sk_subtype;
- if (elemtype == InvalidOid)
- elemtype = rel->rd_opcintype[skey->sk_attno - 1];
-
/*
* Look up the appropriate comparison operator in the opfamily.
*
@@ -399,6 +800,8 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
* non-cross-type comparison operators for any datatype that it supports
* at all.
*/
+ Assert(skey->sk_strategy != BTEqualStrategyNumber);
+ Assert(OidIsValid(elemtype));
cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
elemtype,
elemtype,
@@ -433,50 +836,21 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
* The array elements are sorted in-place, and the new number of elements
* after duplicate removal is returned.
*
- * scan and skey identify the index column, whose opfamily determines the
- * comparison semantics. If reverse is true, we sort in descending order.
+ * skey identifies the index column whose opfamily determines the comparison
+ * semantics, and sortproc is a corresponding ORDER proc. If reverse is true,
+ * we sort in descending order.
*/
static int
-_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
- bool reverse,
+_bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse,
Datum *elems, int nelems)
{
- Relation rel = scan->indexRelation;
- Oid elemtype;
- RegProcedure cmp_proc;
BTSortArrayContext cxt;
if (nelems <= 1)
return nelems; /* no work to do */
- /*
- * Determine the nominal datatype of the array elements. We have to
- * support the convention that sk_subtype == InvalidOid means the opclass
- * input type; this is a hack to simplify life for ScanKeyInit().
- */
- elemtype = skey->sk_subtype;
- if (elemtype == InvalidOid)
- elemtype = rel->rd_opcintype[skey->sk_attno - 1];
-
- /*
- * Look up the appropriate comparison function in the opfamily.
- *
- * Note: it's possible that this would fail, if the opfamily is
- * incomplete, but it seems quite unlikely that an opfamily would omit
- * non-cross-type support functions for any datatype that it supports at
- * all.
- */
- cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
- elemtype,
- elemtype,
- BTORDER_PROC);
- if (!RegProcedureIsValid(cmp_proc))
- elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
- BTORDER_PROC, elemtype, elemtype,
- rel->rd_opfamily[skey->sk_attno - 1]);
-
/* Sort the array elements */
- fmgr_info(cmp_proc, &cxt.flinfo);
+ cxt.sortproc = sortproc;
cxt.collation = skey->sk_collation;
cxt.reverse = reverse;
qsort_arg(elems, nelems, sizeof(Datum),
@@ -487,6 +861,232 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
_bt_compare_array_elements, &cxt);
}
+/*
+ * _bt_merge_arrays() -- merge next array's elements into an original array
+ *
+ * Called when preprocessing encounters a pair of array equality scan keys,
+ * both against the same index attribute (during initial array preprocessing).
+ * Merging reorganizes caller's original array (the left hand arg) in-place,
+ * without ever copying elements from one array into the other. (Mixing the
+ * elements together like this would be wrong, since they don't necessarily
+ * use the same underlying element type, despite all the other similarities.)
+ *
+ * Both arrays must have already been sorted and deduplicated by calling
+ * _bt_sort_array_elements. sortproc is the same-type ORDER proc that was
+ * just used to sort and deduplicate caller's "next" array. We'll usually be
+ * able to reuse that ORDER proc to merge the arrays together now. If not,
+ * then we'll perform a separate ORDER proc lookup.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
+ * may not be able to determine which elements are contradictory. If we have
+ * the required ORDER proc then we return true (and validly set *nelems_orig),
+ * guaranteeing that at least the next array can be considered redundant. We
+ * return false if the required comparisons cannot be made (caller must
+ * keep both arrays when this happens).
+ */
+static bool
+_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc,
+ bool reverse, Oid origelemtype, Oid nextelemtype,
+ Datum *elems_orig, int *nelems_orig,
+ Datum *elems_next, int nelems_next)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTSortArrayContext cxt;
+ int nelems_orig_start = *nelems_orig,
+ nelems_orig_merged = 0;
+ FmgrInfo *mergeproc = sortproc;
+ FmgrInfo crosstypeproc;
+
+ Assert(skey->sk_strategy == BTEqualStrategyNumber);
+ Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype));
+
+ if (origelemtype != nextelemtype)
+ {
+ RegProcedure cmp_proc;
+
+ /*
+ * Cross-array-element-type merging is required, so can't just reuse
+ * sortproc when merging
+ */
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
+ origelemtype, nextelemtype, BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ {
+ /* Can't make the required comparisons */
+ return false;
+ }
+
+ /* We have all we need to determine redundancy/contradictoriness */
+ mergeproc = &crosstypeproc;
+ fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext);
+ }
+
+ cxt.sortproc = mergeproc;
+ cxt.collation = skey->sk_collation;
+ cxt.reverse = reverse;
+
+ for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;)
+ {
+ Datum *oelem = elems_orig + i,
+ *nelem = elems_next + j;
+ int res = _bt_compare_array_elements(oelem, nelem, &cxt);
+
+ if (res == 0)
+ {
+ elems_orig[nelems_orig_merged++] = *oelem;
+ i++;
+ j++;
+ }
+ else if (res < 0)
+ i++;
+ else /* res > 0 */
+ j++;
+ }
+
+ *nelems_orig = nelems_orig_merged;
+
+ return true;
+}
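+
+/*
+ * For example, merging an original array {1, 3, 5} with a "next" array of
+ * {3, 4, 5} (both already sorted and deduplicated) shrinks the original
+ * array to {3, 5}, the elements common to both inputs.  The next array as a
+ * whole then becomes redundant.  If the two arrays have no elements in
+ * common, *nelems_orig is set to 0, leaving caller with an unsatisfiable
+ * qual.
+ */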
+
+/*
+ * Compare an array scan key to a scalar scan key, eliminating contradictory
+ * array elements such that the scalar scan key becomes redundant.
+ *
+ * Array elements can be eliminated as contradictory when excluded by some
+ * other operator on the same attribute. For example, with an index scan qual
+ * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1"
+ * are eliminated, and the < scan key is eliminated as redundant. Cases where
+ * every array element is eliminated by a redundant scalar scan key have an
+ * unsatisfiable qual, which we handle by setting *qual_ok=false for caller.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
+ * may not be able to determine which elements are contradictory. If we have
+ * the required ORDER proc then we return true (and validly set *qual_ok),
+ * guaranteeing that at least the scalar scan key can be considered redundant.
+ * We return false if the comparison could not be made (caller must keep both
+ * scan keys when this happens).
+ */
+static bool
+_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey,
+ FmgrInfo *orderproc, BTArrayKeyInfo *array,
+ bool *qual_ok)
+{
+ Relation rel = scan->indexRelation;
+ Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1];
+ int cmpresult = 0,
+ cmpexact = 0,
+ matchelem,
+ new_nelems = 0;
+ FmgrInfo crosstypeproc;
+ FmgrInfo *orderprocp = orderproc;
+
+ Assert(arraysk->sk_attno == skey->sk_attno);
+ Assert(array->num_elems > 0);
+ Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
+ Assert((arraysk->sk_flags & SK_SEARCHARRAY) &&
+ arraysk->sk_strategy == BTEqualStrategyNumber);
+ Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
+ Assert(!(skey->sk_flags & SK_SEARCHARRAY) ||
+ skey->sk_strategy != BTEqualStrategyNumber);
+
+ /*
+ * _bt_binsrch_array_skey searches an array for the entry best matching a
+ * datum of opclass input type for the index's attribute (on-disk type).
+ * We can reuse the array's ORDER proc whenever the non-array scan key's
+ * type is a match for the corresponding attribute's input opclass type.
+ * Otherwise, we have to do another ORDER proc lookup so that our call to
+ * _bt_binsrch_array_skey applies the correct comparator.
+ *
+ * Note: we have to support the convention that sk_subtype == InvalidOid
+ * means the opclass input type; this is a hack to simplify life for
+ * ScanKeyInit().
+ */
+ if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid)
+ {
+ RegProcedure cmp_proc;
+ Oid arraysk_elemtype;
+
+ /*
+ * Need an ORDER proc lookup to detect redundancy/contradictoriness
+ * with this pair of scankeys.
+ *
+ * Scalar scan key's argument will be passed to _bt_compare_array_skey
+ * as its tupdatum/lefthand argument (rhs arg is for array elements).
+ */
+ arraysk_elemtype = arraysk->sk_subtype;
+ if (arraysk_elemtype == InvalidOid)
+ arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1];
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1],
+ skey->sk_subtype, arraysk_elemtype,
+ BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ {
+ /* Can't make the comparison */
+ *qual_ok = false; /* suppress compiler warnings */
+ return false;
+ }
+
+ /* We have all we need to determine redundancy/contradictoriness */
+ orderprocp = &crosstypeproc;
+ fmgr_info(cmp_proc, orderprocp);
+ }
+
+ matchelem = _bt_binsrch_array_skey(orderprocp, false,
+ NoMovementScanDirection,
+ skey->sk_argument, false, array,
+ arraysk, &cmpresult);
+
+ switch (skey->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ cmpexact = 1; /* exclude exact match, if any */
+ /* FALL THRU */
+ case BTLessEqualStrategyNumber:
+ if (cmpresult >= cmpexact)
+ matchelem++;
+ /* Resize, keeping elements from the start of the array */
+ new_nelems = matchelem;
+ break;
+ case BTEqualStrategyNumber:
+ if (cmpresult != 0)
+ {
+ /* qual is unsatisfiable */
+ new_nelems = 0;
+ }
+ else
+ {
+ /* Shift matching element to the start of the array, resize */
+ array->elem_values[0] = array->elem_values[matchelem];
+ new_nelems = 1;
+ }
+ break;
+ case BTGreaterEqualStrategyNumber:
+ cmpexact = 1; /* include exact match, if any */
+ /* FALL THRU */
+ case BTGreaterStrategyNumber:
+ if (cmpresult >= cmpexact)
+ matchelem++;
+ /* Shift matching elements to the start of the array, resize */
+ new_nelems = array->num_elems - matchelem;
+ memmove(array->elem_values, array->elem_values + matchelem,
+ sizeof(Datum) * new_nelems);
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) skey->sk_strategy);
+ break;
+ }
+
+ Assert(new_nelems >= 0);
+ Assert(new_nelems <= array->num_elems);
+
+ array->num_elems = new_nelems;
+ *qual_ok = new_nelems > 0;
+
+ return true;
+}
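+
+/*
+ * To extend the example from the header comment: given "WHERE a IN (1, 2, 3)
+ * AND a >= 2", the binary search lands on the array element "2" exactly, the
+ * >= case keeps the exact match, and the memmove() leaves {2, 3} behind.
+ * With "a > 2" instead, the exact match is excluded, leaving just {3}.
+ */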
+
/*
* qsort_arg comparator for sorting array elements
*/
@@ -498,7 +1098,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg)
BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
int32 compare;
- compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
+ compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc,
cxt->collation,
da, db));
if (cxt->reverse)
@@ -506,11 +1106,233 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg)
return compare;
}
+/*
+ * _bt_compare_array_skey() -- apply array comparison function
+ *
+ * Compares caller's tuple attribute value to a scan key/array element.
+ * Helper function used during binary searches of SK_SEARCHARRAY arrays.
+ *
+ * This routine returns:
+ * <0 if tupdatum < arrdatum;
+ * 0 if tupdatum == arrdatum;
+ * >0 if tupdatum > arrdatum.
+ *
+ * This is essentially the same interface as _bt_compare: both functions
+ * compare the value that they're searching for to a binary search pivot.
+ * However, unlike _bt_compare, this function's "tuple argument" comes first,
+ * while its "array/scankey argument" comes second.
+*/
+static inline int32
+_bt_compare_array_skey(FmgrInfo *orderproc,
+ Datum tupdatum, bool tupnull,
+ Datum arrdatum, ScanKey cur)
+{
+ int32 result = 0;
+
+ Assert(cur->sk_strategy == BTEqualStrategyNumber);
+
+ if (tupnull) /* NULL tupdatum */
+ {
+ if (cur->sk_flags & SK_ISNULL)
+ result = 0; /* NULL "=" NULL */
+ else if (cur->sk_flags & SK_BT_NULLS_FIRST)
+ result = -1; /* NULL "<" NOT_NULL */
+ else
+ result = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */
+ {
+ if (cur->sk_flags & SK_BT_NULLS_FIRST)
+ result = 1; /* NOT_NULL ">" NULL */
+ else
+ result = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ /*
+ * Like _bt_compare, we need to be careful of cross-type comparisons,
+ * so the left value has to be the value that came from an index tuple
+ */
+ result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation,
+ tupdatum, arrdatum));
+
+ /*
+ * We flip the sign by following the obvious rule: flip whenever the
+ * column is a DESC column.
+ *
+ * _bt_compare does it the wrong way around (flip when *ASC*) in order
+ * to compensate for passing its orderproc arguments backwards. We
+ * don't need to play these games because we find it natural to pass
+ * tupdatum as the left value (and arrdatum as the right value).
+ */
+ if (cur->sk_flags & SK_BT_DESC)
+ INVERT_COMPARE_RESULT(result);
+ }
+
+ return result;
+}
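+
+/*
+ * For example, with a DESC column whose tuple value is 7 and an array
+ * element of 5, the underlying ORDER proc returns a positive result, which
+ * INVERT_COMPARE_RESULT flips to negative: 7 sorts before 5 in the key
+ * space of a DESC column, so reporting tupdatum < arrdatum is correct.
+ */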
+
+/*
+ * _bt_binsrch_array_skey() -- Binary search for next matching array key
+ *
+ * Returns an index to the first array element >= caller's tupdatum argument.
+ * This convention is more natural for forwards scan callers, but that can't
+ * really matter to backwards scan callers. Both callers require handling for
+ * the case where the match we return is < tupdatum, and symmetric handling
+ * for the case where our best match is > tupdatum.
+ *
+ * Also sets *set_elem_result to the result _bt_compare_array_skey returned
+ * when we used it to compare the matching array element to tupdatum/tupnull.
+ *
+ * cur_elem_trig indicates if array advancement was triggered by this array's
+ * scan key, and that the array is for a required scan key. We can apply this
+ * information to find the next matching array element in the current scan
+ * direction using far fewer comparisons (fewer on average, compared to naive
+ * binary search). This scheme takes advantage of an important property of
+ * required arrays: required arrays always advance in lockstep with the index
+ * scan's progress through the index's key space.
+ */
+static int
+_bt_binsrch_array_skey(FmgrInfo *orderproc,
+ bool cur_elem_trig, ScanDirection dir,
+ Datum tupdatum, bool tupnull,
+ BTArrayKeyInfo *array, ScanKey cur,
+ int32 *set_elem_result)
+{
+ int low_elem = 0,
+ mid_elem = -1,
+ high_elem = array->num_elems - 1,
+ result = 0;
+ Datum arrdatum;
+
+ Assert(cur->sk_flags & SK_SEARCHARRAY);
+ Assert(cur->sk_strategy == BTEqualStrategyNumber);
+
+ if (cur_elem_trig)
+ {
+ Assert(!ScanDirectionIsNoMovement(dir));
+ Assert(cur->sk_flags & SK_BT_REQFWD);
+
+ /*
+ * When the scan key that triggered array advancement is a required
+ * array scan key, it is now certain that the current array element
+ * (plus all prior elements relative to the current scan direction)
+ * cannot possibly be at or ahead of the corresponding tuple value.
+ * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
+ * makes sure this is true as a condition of advancing the arrays.)
+ *
+ * This makes it safe to exclude array elements up to and including
+ * the former-current array element from our search.
+ *
+ * Separately, when array advancement was triggered by a required scan
+ * key, the array element immediately after the former-current element
+ * is often either an exact tupdatum match, or a "close by" near-match
+ * (a near-match tupdatum is one whose key space falls _between_ the
+ * former-current and new-current array elements). We'll detect both
+ * cases via an optimistic comparison of the new search lower bound
+ * (or new search upper bound in the case of backwards scans).
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
+
+ /* Compare prospective new cur_elem (also the new lower bound) */
+ if (high_elem >= low_elem)
+ {
+ arrdatum = array->elem_values[low_elem];
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result <= 0)
+ {
+ /* Optimistic comparison optimization worked out */
+ *set_elem_result = result;
+ return low_elem;
+ }
+ mid_elem = low_elem;
+ low_elem++; /* this cur_elem exhausted, too */
+ }
+
+ if (high_elem < low_elem)
+ {
+ /* Caller needs to perform "beyond end" array advancement */
+ *set_elem_result = 1;
+ return high_elem;
+ }
+ }
+ else
+ {
+ high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
+
+ /* Compare prospective new cur_elem (also the new upper bound) */
+ if (high_elem >= low_elem)
+ {
+ arrdatum = array->elem_values[high_elem];
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result >= 0)
+ {
+ /* Optimistic comparison optimization worked out */
+ *set_elem_result = result;
+ return high_elem;
+ }
+ mid_elem = high_elem;
+ high_elem--; /* this cur_elem exhausted, too */
+ }
+
+ if (high_elem < low_elem)
+ {
+ /* Caller needs to perform "beyond end" array advancement */
+ *set_elem_result = -1;
+ return low_elem;
+ }
+ }
+ }
+
+ while (high_elem > low_elem)
+ {
+ mid_elem = low_elem + ((high_elem - low_elem) / 2);
+ arrdatum = array->elem_values[mid_elem];
+
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ arrdatum, cur);
+
+ if (result == 0)
+ {
+ /*
+ * It's safe to quit as soon as we see an equal array element.
+ * This often saves an extra comparison or two...
+ */
+ low_elem = mid_elem;
+ break;
+ }
+
+ if (result > 0)
+ low_elem = mid_elem + 1;
+ else
+ high_elem = mid_elem;
+ }
+
+ /*
+ * ...but our caller also cares about how its searched-for tuple datum
+ * compares to the low_elem datum. Must always set *set_elem_result with
+ * the result of that comparison specifically.
+ */
+ if (low_elem != mid_elem)
+ result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
+ array->elem_values[low_elem], cur);
+
+ *set_elem_result = result;
+
+ return low_elem;
+}
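+
+/*
+ * For example, during a forward scan with an array of {10, 20, 30} and
+ * cur_elem = 0, a cur_elem_trig search for tupdatum 20 avoids the full
+ * binary search: the new lower bound (the element just after the
+ * former-current element) is compared first, turns out to be an exact
+ * match, and is returned right away with *set_elem_result = 0.
+ */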
+
/*
* _bt_start_array_keys() -- Initialize array keys at start of a scan
*
* Set up the cur_elem counters and fill in the first sk_argument value for
- * each array scankey. We can't do this until we know the scan direction.
+ * each array scankey.
*/
void
_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
@@ -518,159 +1340,1132 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int i;
+ Assert(so->numArrayKeys);
+ Assert(so->qual_ok);
+
for (i = 0; i < so->numArrayKeys; i++)
{
BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
- ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ ScanKey skey = &so->keyData[curArrayKey->scan_key];
Assert(curArrayKey->num_elems > 0);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+
if (ScanDirectionIsBackward(dir))
curArrayKey->cur_elem = curArrayKey->num_elems - 1;
else
curArrayKey->cur_elem = 0;
skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
}
-
- so->arraysStarted = true;
+ so->scanBehind = false;
}
/*
- * _bt_advance_array_keys() -- Advance to next set of array elements
+ * _bt_advance_array_keys_increment() -- Advance to next set of array elements
+ *
+ * Advances the array keys by a single increment in the current scan
+ * direction. When there are multiple array keys this can roll over from the
+ * lowest order array to higher order arrays.
*
* Returns true if there is another set of values to consider, false if not.
* On true result, the scankeys are initialized with the next set of values.
+ * On false result, the scankeys stay the same, and the array keys are not
+ * advanced (every array remains at its final element for the scan direction).
*/
-bool
-_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
+static bool
+_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- bool found = false;
- int i;
/*
* We must advance the last array key most quickly, since it will
* correspond to the lowest-order index column among the available
- * qualifications. This is necessary to ensure correct ordering of output
- * when there are multiple array keys.
+ * qualifications
*/
- for (i = so->numArrayKeys - 1; i >= 0; i--)
+ for (int i = so->numArrayKeys - 1; i >= 0; i--)
{
BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
- ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
+ ScanKey skey = &so->keyData[curArrayKey->scan_key];
int cur_elem = curArrayKey->cur_elem;
int num_elems = curArrayKey->num_elems;
+ bool rolled = false;
- if (ScanDirectionIsBackward(dir))
+ if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems)
{
- if (--cur_elem < 0)
- {
- cur_elem = num_elems - 1;
- found = false; /* need to advance next array key */
- }
- else
- found = true;
+ cur_elem = 0;
+ rolled = true;
}
- else
+ else if (ScanDirectionIsBackward(dir) && --cur_elem < 0)
{
- if (++cur_elem >= num_elems)
- {
- cur_elem = 0;
- found = false; /* need to advance next array key */
- }
- else
- found = true;
+ cur_elem = num_elems - 1;
+ rolled = true;
}
curArrayKey->cur_elem = cur_elem;
skey->sk_argument = curArrayKey->elem_values[cur_elem];
- if (found)
- break;
- }
+ if (!rolled)
+ return true;
- /* advance parallel scan */
- if (scan->parallel_scan != NULL)
- _bt_parallel_advance_array_keys(scan);
+ /* Need to advance next array key, if any */
+ }
/*
- * When no new array keys were found, the scan is "past the end" of the
- * array keys. _bt_start_array_keys can still "restart" the array keys if
- * a rescan is required.
+ * The array keys are now exhausted. (There isn't actually a distinct
+ * state that represents array exhaustion, since index scans don't always
+ * end after btgettuple returns "false".)
+ *
+ * Restore the array keys to the state they were in immediately before we
+ * were called. This ensures that the arrays only ever ratchet in the
+ * current scan direction. Without this, scans would overlook matching
+ * tuples if and when the scan's direction was subsequently reversed.
*/
- if (!found)
- so->arraysStarted = false;
+ _bt_start_array_keys(scan, -dir);
- return found;
+ return false;
}
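+
+/*
+ * For example, with arrays a in {1, 2} and b in {10, 20} positioned on
+ * (2, 20), a forward increment rolls "b" over, then finds that "a" must
+ * also roll over: the arrays are exhausted, so we restore (2, 20) (the
+ * final elements for a forward scan) and return false.
+ */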
/*
- * _bt_mark_array_keys() -- Handle array keys during btmarkpos
+ * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
*
- * Save the current state of the array keys as the "mark" position.
+ * Called when _bt_advance_array_keys decides to start a new primitive index
+ * scan on the basis of the current scan position being before the position
+ * that _bt_first is capable of repositioning the scan to by applying an
+ * inequality operator required in the opposite-to-scan direction only.
+ *
+ * Although equality strategy scan keys (for both arrays and non-arrays alike)
+ * are either marked required in both directions or in neither direction,
+ * there is a sense in which non-required arrays behave like required arrays.
+ * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
+ * the scan key on "c" is non-required, but nevertheless enables positioning
+ * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
+ * first descent of the tree by _bt_first. Later on, there could also be a
+ * second descent, that places the scan right before tuples >= "(200, 3, 5)".
+ * _bt_first must never be allowed to build an insertion scan key whose "c"
+ * entry is set to a value other than 5, the "c" array's first element/value.
+ * (Actually, it's the first in the current scan direction. This example uses
+ * a forward scan.)
+ *
+ * Calling here resets the array scan key elements for the scan's non-required
+ * arrays. This is strictly necessary for correctness in a subset of cases
+ * involving "required in opposite direction"-triggered primitive index scans.
+ * Not all callers are at risk of _bt_first using a non-required array like
+ * this, but advancement always resets the arrays when another primitive scan
+ * is scheduled, just to keep things simple. Array advancement even makes
+ * sure to reset non-required arrays during scans that have no inequalities.
+ * (Advancement still won't call here when there are no inequalities, though
+ * that's just because it's all handled indirectly instead.)
+ *
+ * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
+ * everybody got this right.
*/
-void
-_bt_mark_array_keys(IndexScanDesc scan)
+static void
+_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int i;
+ int arrayidx = 0;
- for (i = 0; i < so->numArrayKeys; i++)
+ for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
{
- BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey cur = so->keyData + ikey;
+ BTArrayKeyInfo *array = NULL;
+ int first_elem_dir;
- curArrayKey->mark_elem = curArrayKey->cur_elem;
+ if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+ cur->sk_strategy != BTEqualStrategyNumber)
+ continue;
+
+ array = &so->arrayKeys[arrayidx++];
+ Assert(array->scan_key == ikey);
+
+ if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
+ continue;
+
+ if (ScanDirectionIsForward(dir))
+ first_elem_dir = 0;
+ else
+ first_elem_dir = array->num_elems - 1;
+
+ if (array->cur_elem != first_elem_dir)
+ {
+ array->cur_elem = first_elem_dir;
+ cur->sk_argument = array->elem_values[first_elem_dir];
+ }
}
}
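+
+/*
+ * Using the example from the header comment: only the non-required array on
+ * "c" is rewound here (to its first element for the scan direction, 5 for a
+ * forward scan).  The required array on "a" is deliberately left alone,
+ * since required arrays must only ever ratchet in the current scan
+ * direction.
+ */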
/*
- * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
*
- * Restore the array keys to where they were when the mark was set.
+ * We always compare the tuple using the current array keys (which we assume
+ * are already set in so->keyData[]). readpagetup indicates if tuple is the
+ * scan's current _bt_readpage-wise tuple.
+ *
+ * readpagetup callers must only call here when _bt_check_compare already set
+ * continuescan=false. We help these callers deal with _bt_check_compare's
+ * inability to distinguish between the < and > cases (it uses equality
+ * operator scan keys, whereas we use 3-way ORDER procs). These callers pass
+ * a _bt_check_compare-set sktrig value that indicates which scan key
+ * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
+ * This information allows us to avoid wastefully checking earlier scan keys
+ * that were already deemed to have been satisfied inside _bt_check_compare.
+ *
+ * Returns false when caller's tuple is >= the current required equality scan
+ * keys (or <=, in the case of backwards scans). This happens to readpagetup
+ * callers when the scan has reached the point of needing its array keys
+ * advanced; caller will need to advance required and non-required arrays at
+ * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
+ * (When we return false to readpagetup callers, tuple can only be == current
+ * required equality scan keys when caller's sktrig indicates that the arrays
+ * need to be advanced due to an unsatisfied required inequality key trigger.)
+ *
+ * Returns true when caller passes a tuple that is < the current set of
+ * equality keys for the most significant non-equal required scan key/column
+ * (or > the keys, during backwards scans). This happens to readpagetup
+ * callers when tuple is still before the start of matches for the scan's
+ * required equality strategy scan keys. (sktrig can't have indicated that an
+ * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
+ * return true. In fact, we automatically return false when passed such an
+ * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
+ * continuescan=false doesn't really need to be confirmed here by us.)
+ *
+ * !readpagetup callers optionally pass us *scanBehind, which tracks whether
+ * any missing truncated attributes might have affected array advancement
+ * (compared to what would happen if it was shown the first non-pivot tuple on
+ * the page to the right of caller's finaltup/high key tuple instead). It's
+ * only possible that we'll set *scanBehind to true when caller passes us a
+ * pivot tuple (with truncated -inf attributes) that we return false for.
*/
-void
-_bt_restore_array_keys(IndexScanDesc scan)
+static bool
+_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+ bool readpagetup, int sktrig, bool *scanBehind)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- bool changed = false;
- int i;
- /* Restore each array key to its position when the mark was set */
- for (i = 0; i < so->numArrayKeys; i++)
+ Assert(so->numArrayKeys);
+ Assert(so->numberOfKeys);
+ Assert(sktrig == 0 || readpagetup);
+ Assert(!readpagetup || scanBehind == NULL);
+
+ if (scanBehind)
+ *scanBehind = false;
+
+ for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++)
{
- BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
- ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
- int mark_elem = curArrayKey->mark_elem;
+ ScanKey cur = so->keyData + ikey;
+ Datum tupdatum;
+ bool tupnull;
+ int32 result;
- if (curArrayKey->cur_elem != mark_elem)
+ /* readpagetup calls require one ORDER proc comparison (at most) */
+ Assert(!readpagetup || ikey == sktrig);
+
+ /*
+ * Once we reach a non-required scan key, we're completely done.
+ *
+ * Note: we deliberately don't consider the scan direction here.
+ * _bt_advance_array_keys caller requires that we track *scanBehind
+ * without concern for scan direction.
+ */
+ if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0)
{
- curArrayKey->cur_elem = mark_elem;
- skey->sk_argument = curArrayKey->elem_values[mark_elem];
- changed = true;
+ Assert(!readpagetup);
+ Assert(ikey > sktrig || ikey == 0);
+ return false;
+ }
+
+ if (cur->sk_attno > tupnatts)
+ {
+ Assert(!readpagetup);
+
+ /*
+ * When we reach a high key's truncated attribute, assume that the
+ * tuple attribute's value is >= the scan's equality constraint
+ * scan keys (but set *scanBehind to let interested callers know
+ * that a truncated attribute might have affected our answer).
+ */
+ if (scanBehind)
+ *scanBehind = true;
+
+ return false;
+ }
+
+ /*
+ * Deal with inequality strategy scan keys that _bt_check_compare set
+ * continuescan=false for
+ */
+ if (cur->sk_strategy != BTEqualStrategyNumber)
+ {
+ /*
+ * When _bt_check_compare indicated that a required inequality
+ * scan key wasn't satisfied, there's no need to verify anything;
+ * caller always calls _bt_advance_array_keys with this sktrig.
+ */
+ if (readpagetup)
+ return false;
+
+ /*
+ * Otherwise we can't give up, since we must check all required
+ * scan keys (required in either direction) in order to correctly
+ * track *scanBehind for caller
+ */
+ continue;
+ }
+
+ tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
+
+ result = _bt_compare_array_skey(&so->orderProcs[ikey],
+ tupdatum, tupnull,
+ cur->sk_argument, cur);
+
+ /*
+ * Does this comparison indicate that caller must _not_ advance the
+ * scan's arrays just yet?
+ */
+ if ((ScanDirectionIsForward(dir) && result < 0) ||
+ (ScanDirectionIsBackward(dir) && result > 0))
+ return true;
+
+ /*
+ * Does this comparison indicate that caller should now advance the
+ * scan's arrays? (Must be if we get here during a readpagetup call.)
+ */
+ if (readpagetup || result != 0)
+ {
+ Assert(result != 0);
+ return false;
+ }
+
+ /*
+ * Inconclusive -- need to check later scan keys, too.
+ *
+ * This must be a finaltup precheck, or a call made from an assertion.
+ */
+ Assert(result == 0);
+ }
+
+ Assert(!readpagetup);
+
+ return false;
+}
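+
+/*
+ * For example, during a forward scan whose current required array key is
+ * "a = 5" (from an array of {5, 7}), a tuple with a = 4 compares as
+ * result < 0, so we return true (too early to advance the arrays).  A
+ * tuple with a = 6 compares as result > 0, so we return false (caller
+ * should now advance the array, here to "a = 7").
+ */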
+
+/*
+ * _bt_start_prim_scan() -- start scheduled primitive index scan?
+ *
+ * Returns true if _bt_checkkeys scheduled another primitive index scan, just
+ * as the last one ended. Otherwise returns false, indicating that the array
+ * keys are now fully exhausted.
+ *
+ * Only call here during scans with one or more equality type array scan keys,
+ * after _bt_first or _bt_next return false.
+ */
+bool
+_bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ Assert(so->numArrayKeys);
+
+ /* scanBehind flag doesn't persist across primitive index scans - reset */
+ so->scanBehind = false;
+
+ /*
+ * Array keys are advanced within _bt_checkkeys when the scan reaches the
+ * leaf level (more precisely, they're advanced when the scan reaches the
+ * end of each distinct set of array elements). This process avoids
+ * repeat access to leaf pages (across multiple primitive index scans) by
+ * advancing the scan's array keys when it allows the primitive index scan
+ * to find nearby matching tuples (or when it eliminates ranges of array
+ * key space that can't possibly be satisfied by any index tuple).
+ *
+ * _bt_checkkeys sets a simple flag variable to schedule another primitive
+ * index scan. The flag tells us what to do.
+ *
+ * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
+ * various cases where that won't happen. For example, if the index is
+ * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
+ * We also don't expect a call to _bt_checkkeys during searches for a
+ * non-existent value that happens to be lower/higher than any existing
+ * value in the index.
+ *
+ * We don't require special handling for these cases -- we don't need to
+ * be explicitly instructed to _not_ perform another primitive index scan.
+ * It's up to code under the control of _bt_first to always set the flag
+ * when another primitive index scan will be required.
+ *
+ * This works correctly, even with the tricky cases listed above, which
+ * all involve access to leaf pages "near the boundaries of the key space"
+ * (whether it's from a leftmost/rightmost page, or an imaginary empty
+ * leaf root page). If _bt_checkkeys cannot be reached by a primitive
+ * index scan for one set of array keys, then it also won't be reached for
+ * any later set ("later" in terms of the direction that we scan the index
+ * and advance the arrays). The array keys won't have advanced in these
+ * cases, but that's the correct behavior (even _bt_advance_array_keys
+ * won't always advance the arrays at the point they become "exhausted").
+ */
+ if (so->needPrimScan)
+ {
+ Assert(_bt_verify_arrays_bt_first(scan, dir));
+
+ /*
+ * Flag was set -- must call _bt_first again, which will reset the
+ * scan's needPrimScan flag
+ */
+ return true;
+ }
+
+ /* The top-level index scan ran out of tuples in this scan direction */
+ if (scan->parallel_scan != NULL)
+ _bt_parallel_done(scan);
+
+ return false;
+}
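+
+/*
+ * Callers are expected to drive the top-level scan with a loop along these
+ * lines (a sketch modeled on btgettuple, with details elided):
+ *
+ * do
+ * {
+ *     if (!BTScanPosIsValid(so->currPos))
+ *         res = _bt_first(scan, dir);
+ *     else
+ *         res = _bt_next(scan, dir);
+ *     if (res)
+ *         break;
+ * } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
+ */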
+
+/*
+ * _bt_advance_array_keys() -- Advance array elements using a tuple
+ *
+ * The scan always gets a new qual as a consequence of calling here (except
+ * when we determine that the top-level scan has run out of matching tuples).
+ * All later _bt_check_compare calls also use the same new qual that was first
+ * used here (at least until the next call here advances the keys once again).
+ * It's convenient to structure _bt_check_compare rechecks of caller's tuple
+ * (using the new qual) as one of the steps of advancing the scan's array keys,
+ * so this function works as a wrapper around _bt_check_compare.
+ *
+ * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
+ * caller, and return a boolean indicating if caller's tuple satisfies the
+ * scan's new qual. But unlike _bt_check_compare, we set so->needPrimScan
+ * when we set continuescan=false, indicating if a new primitive index scan
+ * has been scheduled (otherwise, the top-level scan has run out of tuples in
+ * the current scan direction).
+ *
+ * Caller must use _bt_tuple_before_array_skeys to determine if the current
+ * place in the scan is >= the current array keys _before_ calling here.
+ * We're responsible for ensuring that caller's tuple is <= the newly advanced
+ * required array keys once we return. We try to find an exact match, but
+ * failing that we'll advance the array keys to whatever set of array elements
+ * comes next in the key space for the current scan direction. Required array
+ * keys "ratchet forwards" (or backwards). They can only advance as the scan
+ * itself advances through the index/key space.
+ *
+ * (The rules are the same for backwards scans, except that the operators are
+ * flipped: just replace the precondition's >= operator with a <=, and the
+ * postcondition's <= operator with a >=. In other words, just swap the
+ * precondition with the postcondition.)
+ *
+ * We also deal with "advancing" non-required arrays here. Callers whose
+ * sktrig scan key is non-required specify sktrig_required=false. These calls
+ * are the only exception to the general rule about always advancing the
+ * required array keys (the scan may not even have a required array). These
+ * callers should just pass a NULL pstate (since there is never any question
+ * of stopping the scan). No call to _bt_tuple_before_array_skeys is required
+ * ahead of these calls (it's already clear that any required scan keys must
+ * be satisfied by caller's tuple).
+ *
+ * Note that we deal with non-array required equality strategy scan keys as
+ * degenerate single element arrays here. Obviously, they can never really
+ * advance in the way that real arrays can, but they must still affect how we
+ * advance real array scan keys (exactly like true array equality scan keys).
+ * We have to keep around a 3-way ORDER proc for these (using the "=" operator
+ * won't do), since in general whether the tuple is < or > _any_ unsatisfied
+ * required equality key influences how the scan's real arrays must advance.
+ *
+ * Note also that we may sometimes need to advance the array keys when the
+ * existing required array keys (and other required equality keys) are already
+ * an exact match for every corresponding value from caller's tuple. We must
+ * do this for inequalities that _bt_check_compare set continuescan=false for.
+ * They'll advance the array keys here, just like any other scan key that
+ * _bt_check_compare stops on. (This can even happen _after_ we advance the
+ * array keys, in which case we'll advance the array keys a second time. That
+ * way _bt_checkkeys caller always has its required arrays advance to the
+ * maximum possible extent that its tuple will allow.)
+ */
+static bool
+_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ int sktrig, bool sktrig_required)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+ ScanDirection dir = pstate ? pstate->dir : ForwardScanDirection;
+ int arrayidx = 0;
+ bool beyond_end_advance = false,
+ has_required_opposite_direction_only = false,
+ oppodir_inequality_sktrig = false,
+ all_required_satisfied = true,
+ all_satisfied = true;
+
+ if (sktrig_required)
+ {
+ /*
+ * Precondition array state assertion
+ */
+ Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
+ tupnatts, false, 0, NULL));
+
+ so->scanBehind = false; /* reset */
+
+ /*
+ * Required scan key wasn't satisfied, so required arrays will have to
+ * advance. Invalidate page-level state that tracks whether the
+ * scan's required-in-opposite-direction-only keys are known to be
+ * satisfied by page's remaining tuples.
+ */
+ pstate->firstmatch = false;
+
+ /* Shouldn't have to invalidate 'prechecked', though */
+ Assert(!pstate->prechecked);
+
+ /*
+ * Once we return we'll have a new set of required array keys, so
+ * reset state used by "look ahead" optimization
+ */
+ pstate->rechecks = 0;
+ pstate->targetdistance = 0;
+ }
+
+ Assert(_bt_verify_keys_with_arraykeys(scan));
+
+ for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+ {
+ ScanKey cur = so->keyData + ikey;
+ BTArrayKeyInfo *array = NULL;
+ Datum tupdatum;
+ bool required = false,
+ required_opposite_direction_only = false,
+ tupnull;
+ int32 result;
+ int set_elem = 0;
+
+ if (cur->sk_strategy == BTEqualStrategyNumber)
+ {
+ /* Manage array state */
+ if (cur->sk_flags & SK_SEARCHARRAY)
+ {
+ array = &so->arrayKeys[arrayidx++];
+ Assert(array->scan_key == ikey);
+ }
+ }
+ else
+ {
+ /*
+ * Are any inequalities required in the opposite direction only
+ * present here?
+ */
+ if (((ScanDirectionIsForward(dir) &&
+ (cur->sk_flags & (SK_BT_REQBKWD))) ||
+ (ScanDirectionIsBackward(dir) &&
+ (cur->sk_flags & (SK_BT_REQFWD)))))
+ has_required_opposite_direction_only =
+ required_opposite_direction_only = true;
+ }
+
+ /* Optimization: skip over known-satisfied scan keys */
+ if (ikey < sktrig)
+ continue;
+
+ if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
+ {
+ Assert(sktrig_required);
+
+ required = true;
+
+ if (cur->sk_attno > tupnatts)
+ {
+ /* Set this just like _bt_tuple_before_array_skeys */
+ Assert(sktrig < ikey);
+ so->scanBehind = true;
+ }
+ }
+
+ /*
+ * Handle a required non-array scan key that the initial call to
+ * _bt_check_compare indicated triggered array advancement, if any.
+ *
+ * The non-array scan key's strategy will be <, <=, or = during a
+ * forwards scan (or any one of =, >=, or > during a backwards scan).
+ * It follows that the corresponding tuple attribute's value must now
+ * be either > or >= the scan key value (for backwards scans it must
+ * be either < or <= that value).
+ *
+ * If this is a required equality strategy scan key, this is just an
+ * optimization; _bt_tuple_before_array_skeys already confirmed that
+ * this scan key places us ahead of caller's tuple. There's no need
+ * to repeat that work now. (The same underlying principle also gets
+ * applied by the cur_elem_trig optimization used to speed up searches
+ * for the next array element.)
+ *
+ * If this is a required inequality strategy scan key, we _must_ rely
+ * on _bt_check_compare like this; we aren't capable of directly
+ * evaluating required inequality strategy scan keys here, on our own.
+ */
+ if (ikey == sktrig && !array)
+ {
+ Assert(sktrig_required && required && all_required_satisfied);
+
+ /* Use "beyond end" advancement. See below for an explanation. */
+ beyond_end_advance = true;
+ all_satisfied = all_required_satisfied = false;
+
+ /*
+ * Set a flag that remembers that this was an inequality required
+ * in the opposite scan direction only, that nevertheless
+ * triggered the call here.
+ *
+ * This only happens when an inequality operator (which must be
+ * strict) encounters a group of NULLs that indicate the end of
+ * non-NULL values for tuples in the current scan direction.
+ */
+ if (unlikely(required_opposite_direction_only))
+ oppodir_inequality_sktrig = true;
+
+ continue;
+ }
+
+ /*
+ * Nothing more for us to do with an inequality strategy scan key that
+ * wasn't the one that _bt_check_compare stopped on, though.
+ *
+ * Note: if our later call to _bt_check_compare (to recheck caller's
+ * tuple) sets continuescan=false due to finding this same inequality
+ * unsatisfied (possible when it's required in the scan direction),
+ * we'll deal with it via a recursive "second pass" call.
+ */
+ else if (cur->sk_strategy != BTEqualStrategyNumber)
+ continue;
+
+ /*
+ * Nothing for us to do with an equality strategy scan key that isn't
+ * marked required, either -- unless it's a non-required array
+ */
+ else if (!required && !array)
+ continue;
+
+ /*
+ * Here we perform steps for all array scan keys after a required
+ * array scan key whose binary search triggered "beyond end of array
+ * element" array advancement due to encountering a tuple attribute
+ * value > the closest matching array key (or < for backwards scans).
+ */
+ if (beyond_end_advance)
+ {
+ int final_elem_dir;
+
+ if (ScanDirectionIsBackward(dir) || !array)
+ final_elem_dir = 0;
+ else
+ final_elem_dir = array->num_elems - 1;
+
+ if (array && array->cur_elem != final_elem_dir)
+ {
+ array->cur_elem = final_elem_dir;
+ cur->sk_argument = array->elem_values[final_elem_dir];
+ }
+
+ continue;
+ }
+
+ /*
+ * Here we perform steps for all array scan keys after a required
+ * array scan key whose tuple attribute was < the closest matching
+ * array key when we dealt with it (or > for backwards scans).
+ *
+ * This earlier required array key already puts us ahead of caller's
+ * tuple in the key space (for the current scan direction). We must
+ * make sure that subsequent lower-order array keys do not put us too
+ * far ahead (ahead of tuples that have yet to be seen by our caller).
+ * For example, when a tuple "(a, b) = (42, 5)" advances the array
+ * keys on "a" from 40 to 45, we must also set "b" to whatever the
+ * first array element for "b" is. It would be wrong to allow "b" to
+ * be set based on the tuple value.
+ *
+ * Perform the same steps with truncated high key attributes. You can
+ * think of this as a "binary search" for the element closest to the
+ * value -inf. Again, the arrays must never get ahead of the scan.
+ */
+ if (!all_required_satisfied || cur->sk_attno > tupnatts)
+ {
+ int first_elem_dir;
+
+ if (ScanDirectionIsForward(dir) || !array)
+ first_elem_dir = 0;
+ else
+ first_elem_dir = array->num_elems - 1;
+
+ if (array && array->cur_elem != first_elem_dir)
+ {
+ array->cur_elem = first_elem_dir;
+ cur->sk_argument = array->elem_values[first_elem_dir];
+ }
+
+ continue;
+ }
+
+ /*
+ * Search in scankey's array for the corresponding tuple attribute
+ * value from caller's tuple
+ */
+ tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull);
+
+ if (array)
+ {
+ bool cur_elem_trig = (sktrig_required && ikey == sktrig);
+
+ /*
+ * Binary search for closest match that's available from the array
+ */
+ set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
+ cur_elem_trig, dir,
+ tupdatum, tupnull, array, cur,
+ &result);
+
+ Assert(set_elem >= 0 && set_elem < array->num_elems);
+ }
+ else
+ {
+ Assert(sktrig_required && required);
+
+ /*
+ * This is a required non-array equality strategy scan key, which
+ * we'll treat as a degenerate single element array.
+ *
+ * This scan key's imaginary "array" can't really advance, but it
+ * can still roll over like any other array. (Actually, this is
+ * no different to real single value arrays, which never advance
+ * without rolling over -- they can never truly advance, either.)
+ */
+ result = _bt_compare_array_skey(&so->orderProcs[ikey],
+ tupdatum, tupnull,
+ cur->sk_argument, cur);
+ }
+
+ /*
+ * Consider "beyond end of array element" array advancement.
+ *
+ * When the tuple attribute value is > the closest matching array key
+ * (or < in the backwards scan case), we need to ratchet this array
+ * forward (backward) by one increment, so that caller's tuple ends up
+ * being < final array value instead (or > final array value instead).
+ * This process has to work for all of the arrays, not just this one:
+ * it must "carry" to higher-order arrays when the set_elem that we
+ * just found happens to be the final one for the scan's direction.
+ * Incrementing (decrementing) set_elem itself isn't good enough.
+ *
+ * Our approach is to provisionally use set_elem as if it was an exact
+ * match now, then set each later/less significant array to whatever
+ * its final element is. Once outside the loop we'll then "increment
+ * this array's set_elem" by calling _bt_advance_array_keys_increment.
+ * That way the process rolls over to higher order arrays as needed.
+ *
+ * Under this scheme any required arrays only ever ratchet forwards
+ * (or backwards), and always do so to the maximum possible extent
+ * that we can know will be safe without seeing the scan's next tuple.
+ * We don't need any special handling for required scan keys that lack
+ * a real array to advance, nor for redundant scan keys that couldn't
+ * be eliminated by _bt_preprocess_keys. It won't matter if some of
+ * our "true" array scan keys (or even all of them) are non-required.
+ */
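+ /*
+ * For example, with an array on "a" of {40, 45}: a forward scan tuple
+ * with a = 50 gets result > 0 against its closest match (45), so we use
+ * beyond-end advancement, and the increment performed after the loop
+ * either carries into a higher-order array or exhausts the scan's arrays
+ * entirely.
+ */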
+ if (required &&
+ ((ScanDirectionIsForward(dir) && result > 0) ||
+ (ScanDirectionIsBackward(dir) && result < 0)))
+ beyond_end_advance = true;
+
+ Assert(all_required_satisfied && all_satisfied);
+ if (result != 0)
+ {
+ /*
+ * Track whether caller's tuple satisfies our new post-advancement
+ * qual, for required scan keys, as well as for the entire set of
+ * interesting scan keys (all required scan keys plus non-required
+ * array scan keys are considered interesting.)
+ */
+ all_satisfied = false;
+ if (required)
+ all_required_satisfied = false;
+ else
+ {
+ /*
+ * There's no need to advance the arrays using the best
+ * available match for a non-required array. Give up now.
+ * (Though note that sktrig_required calls still have to do
+ * all the usual post-advancement steps, including the recheck
+ * call to _bt_check_compare.)
+ */
+ break;
+ }
+ }
+
+ /* Advance array keys, even when set_elem isn't an exact match */
+ if (array && array->cur_elem != set_elem)
+ {
+ array->cur_elem = set_elem;
+ cur->sk_argument = array->elem_values[set_elem];
}
}
/*
- * If we changed any keys, we must redo _bt_preprocess_keys. That might
- * sound like overkill, but in cases with multiple keys per index column
- * it seems necessary to do the full set of pushups.
- *
- * Also do this whenever the scan's set of array keys "wrapped around" at
- * the end of the last primitive index scan. There won't have been a call
- * to _bt_preprocess_keys from some other place following wrap around, so
- * we do it for ourselves.
+ * Advance the array keys incrementally whenever "beyond end of array
+ * element" array advancement happens, so that advancement will carry to
+ * higher-order arrays (might exhaust all the scan's arrays instead, which
+ * ends the top-level scan).
*/
- if (changed || !so->arraysStarted)
- {
- _bt_preprocess_keys(scan);
- /* The mark should have been set on a consistent set of keys... */
- Assert(so->qual_ok);
- }
-}
+ if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
+ goto end_toplevel_scan;
+ Assert(_bt_verify_keys_with_arraykeys(scan));
+
+ /*
+ * Does tuple now satisfy our new qual? Recheck with _bt_check_compare.
+ *
+ * Calls triggered by an unsatisfied required scan key, whose tuple now
+ * satisfies all required scan keys, but not all non-required array keys,
+ * will still require a recheck call to _bt_check_compare. They'll still
+ * need its "second pass" handling of required inequality scan keys.
+ * (Might have missed a still-unsatisfied required inequality scan key
+ * that caller didn't detect as the sktrig scan key during its initial
+ * _bt_check_compare call that used the old/original qual.)
+ *
+ * Calls triggered by an unsatisfied non-required array scan key never need
+ * "second pass" handling of required inequalities (nor any other handling
+ * of any required scan key). All that matters is whether caller's tuple
+ * satisfies the new qual, so it's safe to just skip the _bt_check_compare
+ * recheck when we've already determined that it can only return 'false'.
+ */
+ if ((sktrig_required && all_required_satisfied) ||
+ (!sktrig_required && all_satisfied))
+ {
+ int nsktrig = sktrig + 1;
+ bool continuescan;
+
+ Assert(all_required_satisfied);
+
+ /* Recheck _bt_check_compare on behalf of caller */
+ if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+ false, false, false,
+ &continuescan, &nsktrig) &&
+ !so->scanBehind)
+ {
+ /* This tuple satisfies the new qual */
+ Assert(all_satisfied && continuescan);
+
+ if (pstate)
+ pstate->continuescan = true;
+
+ return true;
+ }
+
+ /*
+ * Consider "second pass" handling of required inequalities.
+ *
+ * It's possible that our _bt_check_compare call indicated that the
+ * scan should end due to some unsatisfied inequality that wasn't
+ * initially recognized as such by us. Handle this by calling
+ * ourselves recursively, this time indicating that the trigger is the
+ * inequality that we missed first time around (and using a set of
+ * required array/equality keys that are now exact matches for tuple).
+ *
+ * We make a strong, general guarantee that every _bt_checkkeys call
+ * here will advance the array keys to the maximum possible extent
+ * that we can know to be safe based on caller's tuple alone. If we
+ * didn't perform this step, then that guarantee wouldn't quite hold.
+ */
+ if (unlikely(!continuescan))
+ {
+ bool satisfied PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(sktrig_required);
+ Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber);
+
+ /*
+ * The tuple must use "beyond end" advancement during the
+ * recursive call, so we cannot possibly end up back here when
+ * recursing. We'll consume a small, fixed amount of stack space.
+ */
+ Assert(!beyond_end_advance);
+
+ /* Advance the array keys a second time using same tuple */
+ satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts,
+ tupdesc, nsktrig, true);
+
+ /* This tuple doesn't satisfy the inequality */
+ Assert(!satisfied);
+ return false;
+ }
+
+ /*
+ * Some non-required scan key (from new qual) still not satisfied.
+ *
+ * All scan keys required in the current scan direction must still be
+ * satisfied, though, so we can trust all_required_satisfied below.
+ */
+ }
+
+ /*
+ * When we were called just to deal with "advancing" non-required arrays,
+ * this is as far as we can go (cannot stop the scan for these callers)
+ */
+ if (!sktrig_required)
+ {
+ /* Caller's tuple doesn't match any qual */
+ return false;
+ }
+
+ /*
+ * Postcondition array state assertion (for still-unsatisfied tuples).
+ *
+ * By here we have established that the scan's required arrays (scan must
+ * have at least one required array) advanced, without becoming exhausted.
+ *
+ * Caller's tuple is now < the newly advanced array keys (or > when this
+ * is a backwards scan), except in the case where we only got this far due
+ * to an unsatisfied non-required scan key. Verify that with an assert.
+ *
+ * Note: we don't just quit at this point when all required scan keys were
+ * found to be satisfied because we need to consider edge-cases involving
+ * scan keys required in the opposite direction only; those aren't tracked
+ * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger
+ * scan keys are tracked by all_required_satisfied, since it's convenient
+ * for _bt_check_compare to behave as if they are required in the current
+ * scan direction to deal with NULLs. We'll account for that separately.)
+ */
+ Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
+ false, 0, NULL) ==
+ !all_required_satisfied);
+
+ /*
+ * We generally permit primitive index scans to continue onto the next
+ * sibling page when the page's finaltup satisfies all required scan keys
+ * at the point where we're between pages.
+ *
+ * If caller's tuple is also the page's finaltup, and we see that required
+ * scan keys still aren't satisfied, start a new primitive index scan.
+ */
+ if (!all_required_satisfied && pstate->finaltup == tuple)
+ goto new_prim_scan;
+
+ /*
+ * Proactively check finaltup (don't wait until finaltup is reached by the
+ * scan) when it might well turn out to not be satisfied later on.
+ *
+ * Note: if so->scanBehind hasn't already been set for finaltup by us,
+ * it'll be set during this call to _bt_tuple_before_array_skeys. Either
+ * way, it'll be set correctly (for the whole page) after this point.
+ */
+ if (!all_required_satisfied && pstate->finaltup &&
+ _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
+ BTreeTupleGetNAtts(pstate->finaltup, rel),
+ false, 0, &so->scanBehind))
+ goto new_prim_scan;
+
+ /*
+ * When we encounter a truncated finaltup high key attribute, we're
+ * optimistic about the chances of its corresponding required scan key
+ * being satisfied when we go on to check it against tuples from this
+ * page's right sibling leaf page. We consider truncated attributes to be
+ * satisfied by required scan keys, which allows the primitive index scan
+ * to continue to the next leaf page. We must set so->scanBehind to true
+ * to remember that the last page's finaltup had "satisfied" required scan
+ * keys for one or more truncated attribute values (scan keys required in
+ * _either_ scan direction).
+ *
+ * There is a chance that _bt_checkkeys (which checks so->scanBehind) will
+ * find that even the sibling leaf page's finaltup is < the new array
+ * keys. When that happens, our optimistic policy will have incurred a
+ * single extra leaf page access that could have been avoided.
+ *
+ * A pessimistic policy would give backward scans a gratuitous advantage
+ * over forward scans. We'd punish forward scans for applying more
+ * accurate information from the high key, rather than just using the
+ * final non-pivot tuple as finaltup, in the style of backward scans.
+ * Being pessimistic would also give some scans with non-required arrays a
+ * perverse advantage over similar scans that use required arrays instead.
+ *
+ * You can think of this as a speculative bet on what the scan is likely
+ * to find on the next page. It's not much of a gamble, though, since the
+ * untruncated prefix of attributes must strictly satisfy the new qual
+ * (though it's okay if any non-required scan keys fail to be satisfied).
+ */
+ if (so->scanBehind && has_required_opposite_direction_only)
+ {
+ /*
+ * However, we avoid this behavior whenever the scan involves a scan
+ * key required in the opposite direction to the scan only, along with
+ * a finaltup with at least one truncated attribute that's associated
+ * with a scan key marked required (required in either direction).
+ *
+ * _bt_check_compare simply won't stop the scan for a scan key that's
+ * marked required in the opposite scan direction only. That leaves
+ * us without any reliable way of reconsidering any opposite-direction
+ * inequalities if it turns out that starting a new primitive index
+ * scan will allow _bt_first to skip ahead by a great many leaf pages
+ * (see next section for details of how that works).
+ */
+ goto new_prim_scan;
+ }
+
+ /*
+ * Handle inequalities marked required in the opposite scan direction.
+ * They can also signal that we should start a new primitive index scan.
+ *
+ * It's possible that the scan is now positioned where "matching" tuples
+ * begin, and that caller's tuple satisfies all scan keys required in the
+ * current scan direction. But if caller's tuple still doesn't satisfy
+ * other scan keys that are required in the opposite scan direction only
+ * (e.g., a required >= strategy scan key when scan direction is forward),
+ * it's still possible that there are many leaf pages before the page that
+ * _bt_first could skip straight to. Groveling through all those pages
+ * will always give correct answers, but it can be very inefficient. We
+ * must avoid needlessly scanning extra pages.
+ *
+ * Separately, it's possible that _bt_check_compare set continuescan=false
+ * for a scan key that's required in the opposite direction only. This is
+ * a special case that happens only when _bt_check_compare sees that the
+ * inequality encountered a NULL value. This signals the end of non-NULL
+ * values in the current scan direction, which is reason enough to end the
+ * (primitive) scan. If this happens at the start of a large group of
+ * NULL values, then we shouldn't expect to be called again until after
+ * the scan has already read indefinitely-many leaf pages full of tuples
+ * with NULL suffix values. We need a separate test for this case so that
+ * we don't miss our only opportunity to skip over such a group of pages.
+ * (_bt_first is expected to skip over the group of NULLs by applying a
+ * similar "deduce NOT NULL" rule, where it finishes its insertion scan
+ * key by consing up an explicit SK_SEARCHNOTNULL key.)
+ *
+ * Apply a test against finaltup to detect and recover from these problems:
+ * if even finaltup doesn't satisfy such an inequality, we just skip by
+ * starting a new primitive index scan. When we skip, we know for sure
+ * that all of the tuples on the current page following caller's tuple are
+ * also before the _bt_first-wise start of tuples for our new qual. That
+ * at least suggests many more skippable pages beyond the current page.
+ */
+ if (has_required_opposite_direction_only && pstate->finaltup &&
+ (all_required_satisfied || oppodir_inequality_sktrig))
+ {
+ int nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
+ ScanDirection flipped;
+ bool continuescanflip;
+ int opsktrig;
+
+ /*
+ * We're checking finaltup (which is usually not caller's tuple), so we
+ * cannot reuse the work from caller's earlier _bt_check_compare call.
+ *
+ * Flip the scan direction when calling _bt_check_compare this time,
+ * so that it will set continuescanflip=false when it encounters an
+ * inequality required in the opposite scan direction.
+ */
+ Assert(!so->scanBehind);
+ opsktrig = 0;
+ flipped = -dir;
+ _bt_check_compare(scan, flipped,
+ pstate->finaltup, nfinaltupatts, tupdesc,
+ false, false, false,
+ &continuescanflip, &opsktrig);
+
+ /*
+ * If we ended up here due to the all_required_satisfied criteria,
+ * test opsktrig in a way that ensures that finaltup contains the same
+ * prefix of key columns as caller's tuple (a prefix that satisfies
+ * earlier required-in-current-direction scan keys).
+ *
+ * If we ended up here due to the oppodir_inequality_sktrig criteria,
+ * test opsktrig in a way that ensures that the same scan key that our
+ * caller found to be unsatisfied (by the scan's tuple) was also the
+ * one unsatisfied just now (by finaltup). That way we'll only start
+ * a new primitive scan when we're sure that both tuples _don't_ share
+ * the same prefix of satisfied equality-constrained attribute values,
+ * and that finaltup has a non-NULL attribute value indicated by the
+ * unsatisfied scan key at offset opsktrig/sktrig. (This depends on
+ * _bt_check_compare not caring about the direction that inequalities
+ * are required in whenever NULL attribute values are unsatisfied. It
+ * only cares about the scan direction, and its relationship to
+ * whether NULLs are stored first or last relative to non-NULLs.)
+ */
+ Assert(all_required_satisfied != oppodir_inequality_sktrig);
+ if (unlikely(!continuescanflip &&
+ ((all_required_satisfied && opsktrig > sktrig) ||
+ (oppodir_inequality_sktrig && opsktrig >= sktrig))))
+ {
+ Assert(so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber);
+
+ /*
+ * Make sure that any non-required arrays are set to the first
+ * array element for the current scan direction
+ */
+ _bt_rewind_nonrequired_arrays(scan, dir);
+
+ goto new_prim_scan;
+ }
+ }
+
+ /*
+ * Stick with the ongoing primitive index scan for now.
+ *
+ * It's possible that later tuples will also turn out to have values that
+ * are still < the now-current array keys (or > the current array keys).
+ * Our caller will handle this by performing what amounts to a linear
+ * search of the page, implemented by calling _bt_check_compare and then
+ * _bt_tuple_before_array_skeys for each tuple.
+ *
+ * This approach has various advantages over a binary search of the page.
+ * Repeated binary searches of the page (one binary search for every array
+ * advancement) won't outperform a continuous linear search. While there
+ * are workloads that a naive linear search won't handle well, our caller
+ * has a "look ahead" fallback mechanism to deal with that problem.
+ */
+ pstate->continuescan = true; /* Override _bt_check_compare */
+ so->needPrimScan = false; /* _bt_readpage has more tuples to check */
+
+ if (so->scanBehind)
+ {
+ /* Optimization: skip by setting "look ahead" mechanism's offnum */
+ Assert(ScanDirectionIsForward(dir));
+ pstate->skip = pstate->maxoff + 1;
+ }
+
+ /* Caller's tuple doesn't match the new qual */
+ return false;
+
+new_prim_scan:
+
+ /*
+ * End this primitive index scan, but schedule another.
+ *
+ * Note: If the scan direction happens to change, this scheduled primitive
+ * index scan won't go ahead after all.
+ */
+ pstate->continuescan = false; /* Tell _bt_readpage we're done... */
+ so->needPrimScan = true; /* ...but call _bt_first again */
+
+ if (scan->parallel_scan)
+ _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
+
+ /* Caller's tuple doesn't match the new qual */
+ return false;
+
+end_toplevel_scan:
+
+ /*
+ * End the current primitive index scan, but don't schedule another.
+ *
+ * This ends the entire top-level scan in the current scan direction.
+ *
+ * Note: The scan's arrays (including any non-required arrays) are now in
+ * their final positions for the current scan direction. If the scan
+ * direction happens to change, then the arrays will already be in their
+ * first positions for what will then be the current scan direction.
+ */
+ pstate->continuescan = false; /* Tell _bt_readpage we're done... */
+ so->needPrimScan = false; /* ...don't call _bt_first again, though */
+
+ /* Caller's tuple doesn't match any qual */
+ return false;
+}
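To make the three exits above concrete, here is a minimal standalone sketch of the flag pairs they leave behind; the mock types merely stand in for BTReadPageState and BTScanOpaqueData, and none of this code is part of the patch:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mock stand-ins for BTReadPageState and BTScanOpaqueData (illustration) */
    typedef struct { bool continuescan; } MockPageState;
    typedef struct { bool needPrimScan; } MockScanOpaque;

    int
    main(void)
    {
        MockPageState pstate;
        MockScanOpaque so;

        /* "Stick with the ongoing primitive index scan" exit */
        pstate.continuescan = true;  so.needPrimScan = false;
        /* "new_prim_scan" exit: end this primitive scan, schedule another */
        pstate.continuescan = false; so.needPrimScan = true;
        /* "end_toplevel_scan" exit: end the whole scan in this direction */
        pstate.continuescan = false; so.needPrimScan = false;

        printf("final: continuescan=%d needPrimScan=%d\n",
               pstate.continuescan, so.needPrimScan);
        return 0;
    }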
/*
* _bt_preprocess_keys() -- Preprocess scan keys
*
- * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
+ * The given search-type keys (taken from scan->keyData[])
* are copied to so->keyData[] with possible transformation.
* scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
* the number of output keys (possibly less, never greater).
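As a reminder of what this preprocessing accomplishes before the details below, here is a hedged toy model (plain ints instead of ScanKeys; invented names) of eliminating a redundant inequality and detecting a contradictory qual:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of redundancy elimination: for one int attribute, keep only
     * the most restrictive ">" bound, and notice contradictory "=" quals.
     * (_bt_preprocess_keys does this per (attribute, strategy) with ScanKeys.) */
    int
    main(void)
    {
        int  gt_bounds[] = {4, 10};      /* WHERE x > 4 AND x > 10 */
        int  eq_value = 7;               /* AND x = 7 */
        int  best_gt = gt_bounds[0];
        bool qual_ok = true;

        for (int i = 1; i < 2; i++)
            if (gt_bounds[i] > best_gt)
                best_gt = gt_bounds[i];  /* x > 10 makes x > 4 redundant */

        if (!(eq_value > best_gt))
            qual_ok = false;             /* x = 7 contradicts x > 10 */

        printf("kept: x > %d, qual_ok=%d\n", best_gt, qual_ok);
        return 0;
    }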
@@ -690,8 +2485,9 @@ _bt_restore_array_keys(IndexScanDesc scan)
* The output keys must be sorted by index attribute. Presently we expect
* (but verify) that the input keys are already so sorted --- this is done
* by match_clauses_to_index() in indxpath.c. Some reordering of the keys
- * within each attribute may be done as a byproduct of the processing here,
- * but no other code depends on that.
+ * within each attribute may be done as a byproduct of the processing here.
+ * That process must leave array scan keys (within an attribute) in the same
+ * order as the corresponding entries in the scan's BTArrayKeyInfo array.
*
* The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
* if they must be satisfied in order to continue the scan forward or backward
@@ -748,8 +2544,8 @@ _bt_restore_array_keys(IndexScanDesc scan)
*
* Note: the reason we have to copy the preprocessed scan keys into private
* storage is that we are modifying the array based on comparisons of the
- * key argument values, which could change on a rescan or after moving to
- * new elements of array keys. Therefore we can't overwrite the source data.
+ * key argument values, which could change on a rescan. Therefore we can't
+ * overwrite the source data.
*/
void
_bt_preprocess_keys(IndexScanDesc scan)
@@ -762,11 +2558,31 @@ _bt_preprocess_keys(IndexScanDesc scan)
ScanKey inkeys;
ScanKey outkeys;
ScanKey cur;
- ScanKey xform[BTMaxStrategyNumber];
+ BTScanKeyPreproc xform[BTMaxStrategyNumber];
bool test_result;
int i,
j;
AttrNumber attno;
+ ScanKey arrayKeyData;
+ int *keyDataMap = NULL;
+ int arrayidx = 0;
+
+ /*
+ * We're called at the start of each primitive index scan during scans
+ * that use equality array keys. We can just reuse the scan keys that
+ * were output at the start of the scan's first primitive index scan.
+ */
+ if (so->numberOfKeys > 0)
+ {
+ /*
+ * An earlier call to _bt_advance_array_keys already set everything up.
+ * Just assert that the scan's existing output scan keys are
+ * consistent with its current array elements.
+ */
+ Assert(so->numArrayKeys);
+ Assert(_bt_verify_keys_with_arraykeys(scan));
+ return;
+ }
/* initialize result variables */
so->qual_ok = true;
@@ -775,11 +2591,27 @@ _bt_preprocess_keys(IndexScanDesc scan)
if (numberOfKeys < 1)
return; /* done if qual-less scan */
+ /* If any keys are SK_SEARCHARRAY type, set up array-key info */
+ arrayKeyData = _bt_preprocess_array_keys(scan);
+ if (!so->qual_ok)
+ {
+ /* unmatchable array, so give up */
+ return;
+ }
+
/*
- * Read so->arrayKeyData if array keys are present, else scan->keyData
+ * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
+ * as our input if _bt_preprocess_array_keys just allocated it, else use
+ * scan->keyData[] directly
*/
- if (so->arrayKeyData != NULL)
- inkeys = so->arrayKeyData;
+ if (arrayKeyData)
+ {
+ inkeys = arrayKeyData;
+
+ /* Also maintain keyDataMap for remapping so->orderProc[] later */
+ keyDataMap = MemoryContextAlloc(so->arrayContext,
+ numberOfKeys * sizeof(int));
+ }
else
inkeys = scan->keyData;
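The keyDataMap bookkeeping amounts to an index remapping. A hedged standalone sketch (invented stand-in values, not the patch's code) of how a map from output keys to input keys lets a per-input-key array be consolidated into per-output-key order:

    #include <stdio.h>

    int
    main(void)
    {
        /* Output key j was built from input key keyDataMap[j]; input key 1
         * was eliminated as redundant during preprocessing */
        int keyDataMap[] = {0, 2, 3};
        int inOrderProcs[] = {100, 101, 102, 103};  /* stand-ins for FmgrInfo */
        int outOrderProcs[3];

        for (int j = 0; j < 3; j++)
            outOrderProcs[j] = inOrderProcs[keyDataMap[j]];

        /* the proc array can now be subscripted with output-key offsets */
        for (int j = 0; j < 3; j++)
            printf("output key %d -> proc %d\n", j, outOrderProcs[j]);
        return 0;
    }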
@@ -800,6 +2632,19 @@ _bt_preprocess_keys(IndexScanDesc scan)
/* We can mark the qual as required if it's for first index col */
if (cur->sk_attno == 1)
_bt_mark_scankey_required(outkeys);
+ if (arrayKeyData)
+ {
+ /*
+ * Don't call _bt_preprocess_array_keys_final in this fast path
+ * (we'll miss out on the single value array transformation, but
+ * that's not nearly as important when there's only one scan key)
+ */
+ Assert(cur->sk_flags & SK_SEARCHARRAY);
+ Assert(cur->sk_strategy != BTEqualStrategyNumber ||
+ (so->arrayKeys[0].scan_key == 0 &&
+ OidIsValid(so->orderProcs[0].fn_oid)));
+ }
+
return;
}
@@ -859,13 +2704,29 @@ _bt_preprocess_keys(IndexScanDesc scan)
* check, and we've rejected any combination of it with a regular
* equality condition; but not with other types of conditions.
*/
- if (xform[BTEqualStrategyNumber - 1])
+ if (xform[BTEqualStrategyNumber - 1].skey)
{
- ScanKey eq = xform[BTEqualStrategyNumber - 1];
+ ScanKey eq = xform[BTEqualStrategyNumber - 1].skey;
+ BTArrayKeyInfo *array = NULL;
+ FmgrInfo *orderproc = NULL;
+
+ if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
+ {
+ int eq_in_ikey,
+ eq_arrayidx;
+
+ eq_in_ikey = xform[BTEqualStrategyNumber - 1].ikey;
+ eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
+ array = &so->arrayKeys[eq_arrayidx - 1];
+ orderproc = so->orderProcs + eq_in_ikey;
+
+ Assert(array->scan_key == eq_in_ikey);
+ Assert(OidIsValid(orderproc->fn_oid));
+ }
for (j = BTMaxStrategyNumber; --j >= 0;)
{
- ScanKey chk = xform[j];
+ ScanKey chk = xform[j].skey;
if (!chk || j == (BTEqualStrategyNumber - 1))
continue;
@@ -878,6 +2739,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
}
if (_bt_compare_scankey_args(scan, chk, eq, chk,
+ array, orderproc,
&test_result))
{
if (!test_result)
@@ -887,7 +2749,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
return;
}
/* else discard the redundant non-equality key */
- xform[j] = NULL;
+ Assert(!array || array->num_elems > 0);
+ xform[j].skey = NULL;
+ xform[j].ikey = -1;
}
/* else, cannot determine redundancy, keep both keys */
}
@@ -896,36 +2760,36 @@ _bt_preprocess_keys(IndexScanDesc scan)
}
/* try to keep only one of <, <= */
- if (xform[BTLessStrategyNumber - 1]
- && xform[BTLessEqualStrategyNumber - 1])
+ if (xform[BTLessStrategyNumber - 1].skey
+ && xform[BTLessEqualStrategyNumber - 1].skey)
{
- ScanKey lt = xform[BTLessStrategyNumber - 1];
- ScanKey le = xform[BTLessEqualStrategyNumber - 1];
+ ScanKey lt = xform[BTLessStrategyNumber - 1].skey;
+ ScanKey le = xform[BTLessEqualStrategyNumber - 1].skey;
- if (_bt_compare_scankey_args(scan, le, lt, le,
+ if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
&test_result))
{
if (test_result)
- xform[BTLessEqualStrategyNumber - 1] = NULL;
+ xform[BTLessEqualStrategyNumber - 1].skey = NULL;
else
- xform[BTLessStrategyNumber - 1] = NULL;
+ xform[BTLessStrategyNumber - 1].skey = NULL;
}
}
/* try to keep only one of >, >= */
- if (xform[BTGreaterStrategyNumber - 1]
- && xform[BTGreaterEqualStrategyNumber - 1])
+ if (xform[BTGreaterStrategyNumber - 1].skey
+ && xform[BTGreaterEqualStrategyNumber - 1].skey)
{
- ScanKey gt = xform[BTGreaterStrategyNumber - 1];
- ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
+ ScanKey gt = xform[BTGreaterStrategyNumber - 1].skey;
+ ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].skey;
- if (_bt_compare_scankey_args(scan, ge, gt, ge,
+ if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
&test_result))
{
if (test_result)
- xform[BTGreaterEqualStrategyNumber - 1] = NULL;
+ xform[BTGreaterEqualStrategyNumber - 1].skey = NULL;
else
- xform[BTGreaterStrategyNumber - 1] = NULL;
+ xform[BTGreaterStrategyNumber - 1].skey = NULL;
}
}
@@ -936,11 +2800,13 @@ _bt_preprocess_keys(IndexScanDesc scan)
*/
for (j = BTMaxStrategyNumber; --j >= 0;)
{
- if (xform[j])
+ if (xform[j].skey)
{
ScanKey outkey = &outkeys[new_numberOfKeys++];
- memcpy(outkey, xform[j], sizeof(ScanKeyData));
+ memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
+ if (arrayKeyData)
+ keyDataMap[new_numberOfKeys - 1] = xform[j].ikey;
if (priorNumberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
}
@@ -966,6 +2832,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
ScanKey outkey = &outkeys[new_numberOfKeys++];
memcpy(outkey, cur, sizeof(ScanKeyData));
+ if (arrayKeyData)
+ keyDataMap[new_numberOfKeys - 1] = i;
if (numberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
@@ -977,20 +2845,112 @@ _bt_preprocess_keys(IndexScanDesc scan)
continue;
}
- /* have we seen one of these before? */
- if (xform[j] == NULL)
+ /*
+ * Does this input scan key require further processing as an array?
+ */
+ if (cur->sk_strategy == InvalidStrategy)
{
- /* nope, so remember this scankey */
- xform[j] = cur;
+ /* _bt_preprocess_array_keys marked this array key redundant */
+ Assert(arrayKeyData);
+ Assert(cur->sk_flags & SK_SEARCHARRAY);
+ continue;
+ }
+
+ if (cur->sk_strategy == BTEqualStrategyNumber &&
+ (cur->sk_flags & SK_SEARCHARRAY))
+ {
+ /* _bt_preprocess_array_keys kept this array key */
+ Assert(arrayKeyData);
+ arrayidx++;
+ }
+
+ /*
+ * have we seen a scan key for this same attribute using this same
+ * operator strategy before now?
+ */
+ if (xform[j].skey == NULL)
+ {
+ /* nope, so this scan key wins by default (at least for now) */
+ xform[j].skey = cur;
+ xform[j].ikey = i;
+ xform[j].arrayidx = arrayidx;
}
else
{
- /* yup, keep only the more restrictive key */
- if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
- &test_result))
+ FmgrInfo *orderproc = NULL;
+ BTArrayKeyInfo *array = NULL;
+
+ /*
+ * Seen one of these before, so keep only the more restrictive key
+ * if possible
+ */
+ if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
{
+ /*
+ * Have to set up array keys
+ */
+ if ((cur->sk_flags & SK_SEARCHARRAY))
+ {
+ array = &so->arrayKeys[arrayidx - 1];
+ orderproc = so->orderProcs + i;
+
+ Assert(array->scan_key == i);
+ Assert(OidIsValid(orderproc->fn_oid));
+ }
+ else if ((xform[j].skey->sk_flags & SK_SEARCHARRAY))
+ {
+ array = &so->arrayKeys[xform[j].arrayidx - 1];
+ orderproc = so->orderProcs + xform[j].ikey;
+
+ Assert(array->scan_key == xform[j].ikey);
+ Assert(OidIsValid(orderproc->fn_oid));
+ }
+
+ /*
+ * Both scan keys might have arrays, in which case we'll
+ * arbitrarily pass only one of the arrays. That won't
+ * matter, since _bt_compare_scankey_args is aware that two
+ * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
+ * failed to eliminate redundant arrays through array merging.
+ * _bt_compare_scankey_args just returns false when it sees
+ * this; it won't even try to examine either array.
+ */
+ }
+
+ if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey,
+ array, orderproc, &test_result))
+ {
+ /* Have all we need to determine redundancy */
if (test_result)
- xform[j] = cur;
+ {
+ Assert(!array || array->num_elems > 0);
+
+ /*
+ * New key is more restrictive, and so replaces old key...
+ */
+ if (j != (BTEqualStrategyNumber - 1) ||
+ !(xform[j].skey->sk_flags & SK_SEARCHARRAY))
+ {
+ Assert(!array || array->scan_key == i);
+ xform[j].skey = cur;
+ xform[j].ikey = i;
+ xform[j].arrayidx = arrayidx;
+ }
+ else
+ {
+ /*
+ * ...unless we have to keep the old key because it's
+ * an array that rendered the new key redundant. We
+ * need to make sure that we don't throw away an array
+ * scan key. _bt_compare_scankey_args expects us to
+ * always keep arrays (and discard non-arrays).
+ */
+ Assert(j == (BTEqualStrategyNumber - 1));
+ Assert(xform[j].skey->sk_flags & SK_SEARCHARRAY);
+ Assert(xform[j].ikey == array->scan_key);
+ Assert(!(cur->sk_flags & SK_SEARCHARRAY));
+ }
+ }
else if (j == (BTEqualStrategyNumber - 1))
{
/* key == a && key == b, but a != b */
@@ -1002,22 +2962,130 @@ _bt_preprocess_keys(IndexScanDesc scan)
else
{
/*
- * We can't determine which key is more restrictive. Keep the
- * previous one in xform[j] and push this one directly to the
- * output array.
+ * We can't determine which key is more restrictive. Push
+ * xform[j] directly to the output array, then set xform[j] to
+ * the new scan key.
+ *
+ * Note: We do things this way around so that our arrays are
+ * always in the same order as their corresponding scan keys,
+ * even with incomplete opfamilies. _bt_advance_array_keys
+ * depends on this.
*/
ScanKey outkey = &outkeys[new_numberOfKeys++];
- memcpy(outkey, cur, sizeof(ScanKeyData));
+ memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
+ if (arrayKeyData)
+ keyDataMap[new_numberOfKeys - 1] = xform[j].ikey;
if (numberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
+ xform[j].skey = cur;
+ xform[j].ikey = i;
+ xform[j].arrayidx = arrayidx;
}
}
}
so->numberOfKeys = new_numberOfKeys;
+
+ /*
+ * Now that we've built a temporary mapping from so->keyData[] (output
+ * scan keys) to scan->keyData[] (input scan keys), fix array->scan_key
+ * references. Also consolidate the so->orderProc[] array such that it
+ * can be subscripted using so->keyData[]-wise offsets.
+ */
+ if (arrayKeyData)
+ _bt_preprocess_array_keys_final(scan, keyDataMap);
+
+ /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
}
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Verify that the scan's qual state matches what we expect at the point that
+ * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
+ *
+ * We enforce a rule against non-required array scan keys: they must start out
+ * with whatever element is the first for the scan's current scan direction.
+ * See _bt_rewind_nonrequired_arrays comments for an explanation.
+ */
+static bool
+_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int arrayidx = 0;
+
+ for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+ {
+ ScanKey cur = so->keyData + ikey;
+ BTArrayKeyInfo *array = NULL;
+ int first_elem_dir;
+
+ if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+ cur->sk_strategy != BTEqualStrategyNumber)
+ continue;
+
+ array = &so->arrayKeys[arrayidx++];
+
+ if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+ ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+ continue;
+
+ if (ScanDirectionIsForward(dir))
+ first_elem_dir = 0;
+ else
+ first_elem_dir = array->num_elems - 1;
+
+ if (array->cur_elem != first_elem_dir)
+ return false;
+ }
+
+ return _bt_verify_keys_with_arraykeys(scan);
+}
+
+/*
+ * Verify that the scan's "so->keyData[]" scan keys are in agreement with
+ * its array key state
+ */
+static bool
+_bt_verify_keys_with_arraykeys(IndexScanDesc scan)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int last_sk_attno = InvalidAttrNumber,
+ arrayidx = 0;
+
+ if (!so->qual_ok)
+ return false;
+
+ for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
+ {
+ ScanKey cur = so->keyData + ikey;
+ BTArrayKeyInfo *array;
+
+ if (cur->sk_strategy != BTEqualStrategyNumber ||
+ !(cur->sk_flags & SK_SEARCHARRAY))
+ continue;
+
+ array = &so->arrayKeys[arrayidx++];
+ if (array->scan_key != ikey)
+ return false;
+
+ if (array->num_elems <= 0)
+ return false;
+
+ if (cur->sk_argument != array->elem_values[array->cur_elem])
+ return false;
+ if (last_sk_attno > cur->sk_attno)
+ return false;
+ last_sk_attno = cur->sk_attno;
+ }
+
+ if (arrayidx != so->numArrayKeys)
+ return false;
+
+ return true;
+}
+#endif
+
/*
* Compare two scankey values using a specified operator.
*
@@ -1033,9 +3101,24 @@ _bt_preprocess_keys(IndexScanDesc scan)
* we store the operator result in *result and return true. We return false
* if the comparison could not be made.
*
+ * If either leftarg or rightarg are an array, we'll apply array-specific
+ * rules to determine which array elements are redundant on behalf of caller.
+ * It is up to our caller to save whichever of the two scan keys is the array,
+ * and discard the non-array scan key (the non-array scan key is guaranteed to
+ * be redundant with any complete opfamily). Caller isn't expected to call
+ * here with a pair of array scan keys provided we're dealing with a complete
+ * opfamily (_bt_preprocess_array_keys will merge array keys together to make
+ * sure of that).
+ *
+ * Note: we'll also shrink caller's array as needed to eliminate redundant
+ * array elements. One reason why caller should prefer to discard non-array
+ * scan keys is so that we'll have the opportunity to shrink the array
+ * multiple times, in multiple calls (for each of several other scan keys on
+ * the same index attribute).
+ *
* Note: op always points at the same ScanKey as either leftarg or rightarg.
- * Since we don't scribble on the scankeys, this aliasing should cause no
- * trouble.
+ * Since we don't scribble on the scankeys themselves, this aliasing should
+ * cause no trouble.
*
* Note: this routine needs to be insensitive to any DESC option applied
* to the index column. For example, "x < 4" is a tighter constraint than
@@ -1044,6 +3127,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
static bool
_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
ScanKey leftarg, ScanKey rightarg,
+ BTArrayKeyInfo *array, FmgrInfo *orderproc,
bool *result)
{
Relation rel = scan->indexRelation;
@@ -1112,6 +3196,48 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
return true;
}
+ /*
+ * If either leftarg or rightarg are equality-type array scankeys, we need
+ * specialized handling (since by now we know that IS NULL wasn't used)
+ */
+ if (array)
+ {
+ bool leftarray,
+ rightarray;
+
+ leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) &&
+ leftarg->sk_strategy == BTEqualStrategyNumber);
+ rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) &&
+ rightarg->sk_strategy == BTEqualStrategyNumber);
+
+ /*
+ * _bt_preprocess_array_keys is responsible for merging together array
+ * scan keys, and will do so whenever the opfamily has the required
+ * cross-type support. If it failed to do that, we handle it just
+ * like the case where we can't make the comparison ourselves.
+ */
+ if (leftarray && rightarray)
+ {
+ /* Can't make the comparison */
+ *result = false; /* suppress compiler warnings */
+ return false;
+ }
+
+ /*
+ * Otherwise we need to determine if either one of leftarg or rightarg
+ * uses an array, then pass this through to a dedicated helper
+ * function.
+ */
+ if (leftarray)
+ return _bt_compare_array_scankey_args(scan, leftarg, rightarg,
+ orderproc, array, result);
+ else if (rightarray)
+ return _bt_compare_array_scankey_args(scan, rightarg, leftarg,
+ orderproc, array, result);
+
+ /* FALL THRU */
+ }
+
/*
* The opfamily we need to worry about is identified by the index column.
*/
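The array-shrinking rule described above is easy to picture in isolation. A hedged toy model (plain ints, invented names) of what _bt_compare_array_scankey_args conceptually does for a qual like "x = ANY('{1,5,10,42}') AND x < 7":

    #include <stdio.h>

    int
    main(void)
    {
        int  elem_values[] = {1, 5, 10, 42};
        int  num_elems = 4;
        int  kept = 0;

        /* Keep only array elements satisfying the scalar key "x < 7" */
        for (int i = 0; i < num_elems; i++)
        {
            if (elem_values[i] < 7)
                elem_values[kept++] = elem_values[i];
        }
        num_elems = kept;   /* array shrinks in place: {1, 5} */

        for (int i = 0; i < num_elems; i++)
            printf("kept %d\n", elem_values[i]);
        return 0;
    }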
@@ -1351,60 +3477,234 @@ _bt_mark_scankey_required(ScanKey skey)
*
* Return true if so, false if not. If the tuple fails to pass the qual,
* we also determine whether there's any need to continue the scan beyond
- * this tuple, and set *continuescan accordingly. See comments for
+ * this tuple, and set pstate.continuescan accordingly. See comments for
* _bt_preprocess_keys(), above, about how this is done.
*
* Forward scan callers can pass a high key tuple in the hopes of having
* us set *continuescan to false, and avoiding an unnecessary visit to
* the page to the right.
*
+ * Advances the scan's array keys when necessary for arrayKeys=true callers.
+ * Caller can avoid all array related side-effects when calling just to do a
+ * page continuescan precheck -- pass arrayKeys=false for that. Scans without
+ * any array keys must always pass arrayKeys=false.
+ *
+ * Also stops and starts primitive index scans for arrayKeys=true callers.
+ * Scans with array keys are required to set up page state that helps us with
+ * this. The page's finaltup tuple (the page high key for a forward scan, or
+ * the page's first non-pivot tuple for a backward scan) must be set in
+ * pstate.finaltup ahead of the first call here for the page (or possibly the
+ * first call after an initial continuescan-setting page precheck call). Set
+ * this to NULL for rightmost page (or the leftmost page for backwards scans).
+ *
* scan: index scan descriptor (containing a search-type scankey)
+ * pstate: page level input and output parameters
+ * arrayKeys: should we advance the scan's array keys if necessary?
* tuple: index tuple to test
* tupnatts: number of attributes in tuple (high key may be truncated)
- * dir: direction we are scanning in
- * continuescan: output parameter (will be set correctly in all cases)
- * continuescanPrechecked: indicates that *continuescan flag is known to
- * be true for the last item on the page
- * haveFirstMatch: indicates that we already have at least one match
- * in the current page
*/
bool
-_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
- ScanDirection dir, bool *continuescan,
- bool continuescanPrechecked, bool haveFirstMatch)
+_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+ IndexTuple tuple, int tupnatts)
{
- TupleDesc tupdesc;
- BTScanOpaque so;
- int keysz;
- int ikey;
- ScanKey key;
+ TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ ScanDirection dir = pstate->dir;
+ int ikey = 0;
+ bool res;
Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
- *continuescan = true; /* default assumption */
+ res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+ arrayKeys, pstate->prechecked, pstate->firstmatch,
+ &pstate->continuescan, &ikey);
- tupdesc = RelationGetDescr(scan->indexRelation);
- so = (BTScanOpaque) scan->opaque;
- keysz = so->numberOfKeys;
-
- for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
+#ifdef USE_ASSERT_CHECKING
+ if (!arrayKeys && so->numArrayKeys)
{
- Datum datum;
- bool isNull;
- Datum test;
- bool requiredSameDir = false,
- requiredOppositeDir = false;
+ /*
+ * This is a continuescan precheck call for a scan with array keys.
+ *
+ * Assert that the scan isn't in danger of becoming confused.
+ */
+ Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
+ Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
+ tupnatts, false, 0, NULL));
+ }
+ if (pstate->prechecked || pstate->firstmatch)
+ {
+ bool dcontinuescan;
+ int dikey = 0;
/*
- * Check if the key is required for ordered scan in the same or
- * opposite direction. Save as flag variables for future usage.
+ * Call relied on continuescan/firstmatch prechecks -- assert that we
+ * get the same answer without those optimizations
+ */
+ Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
+ false, false, false,
+ &dcontinuescan, &dikey));
+ Assert(pstate->continuescan == dcontinuescan);
+ }
+#endif
+
+ /*
+ * Only one _bt_check_compare call is required in the common case where
+ * there are no equality strategy array scan keys. Otherwise we can only
+ * accept _bt_check_compare's answer unreservedly when it didn't set
+ * pstate.continuescan=false.
+ */
+ if (!arrayKeys || pstate->continuescan)
+ return res;
+
+ /*
+ * _bt_check_compare call set continuescan=false in the presence of
+ * equality type array keys. This could mean that the tuple is just past
+ * the end of matches for the current array keys.
+ *
+ * It's also possible that the scan is still _before_ the _start_ of
+ * tuples matching the current set of array keys. Check for that first.
+ */
+ if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
+ ikey, NULL))
+ {
+ /*
+ * Tuple is still before the start of matches according to the scan's
+ * required array keys (according to _all_ of its required equality
+ * strategy keys, actually).
+ *
+ * _bt_advance_array_keys occasionally sets so->scanBehind to signal
+ * that the scan's current position/tuples might be significantly
+ * behind (multiple pages behind) its current array keys. When this
+ * happens, we need to be prepared to recover by starting a new
+ * primitive index scan here, on our own.
+ */
+ Assert(!so->scanBehind ||
+ so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
+ if (unlikely(so->scanBehind) && pstate->finaltup &&
+ _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
+ BTreeTupleGetNAtts(pstate->finaltup,
+ scan->indexRelation),
+ false, 0, NULL))
+ {
+ /* Cut our losses -- start a new primitive index scan now */
+ pstate->continuescan = false;
+ so->needPrimScan = true;
+ }
+ else
+ {
+ /* Override _bt_check_compare, continue primitive scan */
+ pstate->continuescan = true;
+
+ /*
+ * We will end up here repeatedly given a group of tuples > the
+ * previous array keys and < the now-current keys (for a backwards
+ * scan it's just the same, though the operators swap positions).
+ *
+ * We must avoid allowing this linear search process to scan very
+ * many tuples from well before the start of tuples matching the
+ * current array keys (or from well before the point where we'll
+ * once again have to advance the scan's array keys).
+ *
+ * We keep the overhead under control by speculatively "looking
+ * ahead" to later still-unscanned items from this same leaf page.
+ * We'll only attempt this once the number of tuples that the
+ * linear search process has examined starts to get out of hand.
+ */
+ pstate->rechecks++;
+ if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
+ {
+ /* See if we should skip ahead within the current leaf page */
+ _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
+
+ /*
+ * Might have set pstate.skip to a later page offset. When
+ * that happens then _bt_readpage caller will inexpensively
+ * skip ahead to a later tuple from the same page (the one
+ * just after the tuple we successfully "looked ahead" to).
+ */
+ }
+ }
+
+ /* This indextuple doesn't match the current qual, in any case */
+ return false;
+ }
+
+ /*
+ * Caller's tuple is >= the current set of array keys and other equality
+ * constraint scan keys (or <= if this is a backwards scan). It's now
+ * clear that we _must_ advance any required array keys in lockstep with
+ * the scan.
+ */
+ return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
+ ikey, true);
+}
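For orientation, here is a hedged standalone mock of the caller-side contract described above (the real loop lives in _bt_readpage; every name here is invented): the caller repeatedly calls the check function, honors a skip offset when set, and stops once continuescan goes false:

    #include <stdbool.h>
    #include <stdio.h>

    /* Standalone mock of the page-scan loop contract (invented names) */
    typedef struct
    {
        int  minoff, maxoff, skip;
        bool continuescan;
    } MockPageState;

    static bool
    mock_checkkeys(MockPageState *pstate, int offnum)
    {
        /* Pretend tuples at offsets 3..5 match; offset 8 ends the scan */
        if (offnum == 8)
            pstate->continuescan = false;
        return offnum >= 3 && offnum <= 5;
    }

    int
    main(void)
    {
        MockPageState pstate = {.minoff = 1, .maxoff = 10, .skip = 0,
                                .continuescan = true};

        for (int offnum = pstate.minoff; offnum <= pstate.maxoff; offnum++)
        {
            if (pstate.skip != 0 && offnum < pstate.skip)
                continue;           /* "look ahead" said to jump forward */
            if (mock_checkkeys(&pstate, offnum))
                printf("matched offnum %d\n", offnum);
            if (!pstate.continuescan)
                break;              /* end of the (primitive) scan */
        }
        return 0;
    }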
+
+/*
+ * Test whether an indextuple satisfies current scan condition.
+ *
+ * Return true if so, false if not. If not, also sets *continuescan to false
+ * when it's also not possible for any later tuples to pass the current qual
+ * (with the scan's current set of array keys, in the current scan direction),
+ * in addition to setting *ikey to the so->keyData[] subscript/offset for the
+ * unsatisfied scan key (needed when caller must consider advancing the scan's
+ * array keys).
+ *
+ * This is a subroutine for _bt_checkkeys. We provisionally assume that
+ * reaching the end of the current set of required keys (in particular the
+ * current required array keys) ends the ongoing (primitive) index scan.
+ * Callers without array keys should just end the scan right away when they
+ * find that continuescan has been set to false here by us. Things are more
+ * complicated for callers with array keys.
+ *
+ * Callers with array keys must first consider advancing the arrays when
+ * continuescan has been set to false here by us. They must then consider if
+ * it really does make sense to end the current (primitive) index scan, in
+ * light of everything that is known at that point. (In general when we set
+ * continuescan=false for these callers it must be treated as provisional.)
+ *
+ * We deal with advancing unsatisfied non-required arrays directly, though.
+ * This is safe, since by definition non-required keys can't end the scan.
+ * This is also how we determine whether non-required arrays are merely
+ * unsatisfied by the current array key, or truly unsatisfied (that is,
+ * unsatisfied by every possible array key).
+ *
+ * Though we advance non-required array keys on our own, that shouldn't have
+ * any lasting consequences for the scan. By definition, non-required arrays
+ * have no fixed relationship with the scan's progress. (There are delicate
+ * considerations for non-required arrays when the arrays need to be advanced
+ * following our setting continuescan to false, but that doesn't concern us.)
+ *
+ * Pass advancenonrequired=false to avoid all array related side effects.
+ * This allows _bt_advance_array_keys caller to avoid infinite recursion.
+ */
+static bool
+_bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+ bool advancenonrequired, bool prechecked, bool firstmatch,
+ bool *continuescan, int *ikey)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+ *continuescan = true; /* default assumption */
+
+ for (; *ikey < so->numberOfKeys; (*ikey)++)
+ {
+ ScanKey key = so->keyData + *ikey;
+ Datum datum;
+ bool isNull;
+ bool requiredSameDir = false,
+ requiredOppositeDirOnly = false;
+
+ /*
+ * Check if the key is required in the current scan direction, in the
+ * opposite scan direction _only_, or in neither direction
*/
if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
requiredSameDir = true;
else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
- requiredOppositeDir = true;
+ requiredOppositeDirOnly = true;
/*
* If the caller told us the *continuescan flag is known to be true
@@ -1422,8 +3722,9 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* Both cases above work except for the row keys, where NULLs could be
* found in the middle of matching values.
*/
- if ((requiredSameDir || (requiredOppositeDir && haveFirstMatch)) &&
- !(key->sk_flags & SK_ROW_HEADER) && continuescanPrechecked)
+ if (prechecked &&
+ (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) &&
+ !(key->sk_flags & SK_ROW_HEADER))
continue;
if (key->sk_attno > tupnatts)
@@ -1434,7 +3735,6 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* right could be any possible value. Assume that truncated
* attribute passes the qual.
*/
- Assert(ScanDirectionIsForward(dir));
Assert(BTreeTupleIsPivot(tuple));
continue;
}
@@ -1495,6 +3795,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* because it's not possible for any future tuples to pass. On
* a forward scan, however, we must keep going, because we may
* have initially positioned to the start of the index.
+ * (_bt_advance_array_keys also relies on this behavior during
+ * forward scans.)
*/
if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
ScanDirectionIsBackward(dir))
@@ -1511,6 +3813,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* because it's not possible for any future tuples to pass. On
* a backward scan, however, we must keep going, because we
* may have initially positioned to the end of the index.
+ * (_bt_advance_array_keys also relies on this behavior during
+ * backward scans.)
*/
if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
ScanDirectionIsForward(dir))
@@ -1524,24 +3828,15 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
}
/*
- * Apply the key-checking function. When the key is required for the
- * opposite direction scan, it must be already satisfied as soon as
- * there is already match on the page. Except for the NULLs checking,
- * which have already done above.
+ * Apply the key-checking function, though only if we must.
+ *
+ * When a key is required in the opposite-of-scan direction _only_,
+ * then it must already be satisfied when firstmatch=true indicates
+ * that an earlier tuple from this same page satisfied it.
*/
- if (!(requiredOppositeDir && haveFirstMatch))
- {
- test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
- datum, key->sk_argument);
- }
- else
- {
- test = true;
- Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation,
- datum, key->sk_argument));
- }
-
- if (!DatumGetBool(test))
+ if (!(requiredOppositeDirOnly && firstmatch) &&
+ !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation,
+ datum, key->sk_argument)))
{
/*
* Tuple fails this qual. If it's a required qual for the current
@@ -1557,7 +3852,19 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
*continuescan = false;
/*
- * In any case, this indextuple doesn't match the qual.
+ * If this is a non-required equality-type array key, the tuple
+ * needs to be checked against every possible array key. Handle
+ * this by "advancing" the scan key's array to a matching value
+ * (if we're successful then the tuple might match the qual).
+ */
+ else if (advancenonrequired &&
+ key->sk_strategy == BTEqualStrategyNumber &&
+ (key->sk_flags & SK_SEARCHARRAY))
+ return _bt_advance_array_keys(scan, NULL, tuple, tupnatts,
+ tupdesc, *ikey, false);
+
+ /*
+ * This indextuple doesn't match the qual.
*/
return false;
}
@@ -1574,7 +3881,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* it's not possible for any future tuples in the current scan direction
* to pass the qual.
*
- * This is a subroutine for _bt_checkkeys, which see for more info.
+ * This is a subroutine for _bt_checkkeys/_bt_check_compare.
*/
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
@@ -1603,7 +3910,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* right could be any possible value. Assume that truncated
* attribute passes the qual.
*/
- Assert(ScanDirectionIsForward(dir));
Assert(BTreeTupleIsPivot(tuple));
cmpresult = 0;
if (subkey->sk_flags & SK_ROW_END)
@@ -1630,6 +3936,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* because it's not possible for any future tuples to pass. On
* a forward scan, however, we must keep going, because we may
* have initially positioned to the start of the index.
+ * (_bt_advance_array_keys also relies on this behavior during
+ * forward scans.)
*/
if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
ScanDirectionIsBackward(dir))
@@ -1646,6 +3954,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* because it's not possible for any future tuples to pass. On
* a backward scan, however, we must keep going, because we
* may have initially positioned to the end of the index.
+ * (_bt_advance_array_keys also relies on this behavior during
+ * backward scans.)
*/
if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
ScanDirectionIsForward(dir))
@@ -1741,6 +4051,90 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
return result;
}
+/*
+ * Determine if a scan with array keys should skip over uninteresting tuples.
+ *
+ * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear
+ * search process (started after it finishes reading an initial group of
+ * matching tuples, used to locate the start of the next group of tuples
+ * matching the next set of required array keys) has already scanned an
+ * excessive number of tuples whose key space is "between arrays".
+ *
+ * When we perform look ahead successfully, we'll set pstate.skip, which
+ * instructs _bt_readpage to skip ahead to that tuple next (could be past the
+ * end of the scan's leaf page). Pages where the optimization is effective
+ * will generally still need to skip several times. Each call here performs
+ * only a single "look ahead" comparison of a later tuple, whose distance from
+ * the current tuple's offset number is determined by applying heuristics.
+ */
+static void
+_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+ int tupnatts, TupleDesc tupdesc)
+{
+ ScanDirection dir = pstate->dir;
+ OffsetNumber aheadoffnum;
+ IndexTuple ahead;
+
+ /* Avoid looking ahead when comparing the page high key */
+ if (pstate->offnum < pstate->minoff)
+ return;
+
+ /*
+ * Don't look ahead when there aren't enough tuples remaining on the page
+ * (in the current scan direction) for it to be worth our while
+ */
+ if (ScanDirectionIsForward(dir) &&
+ pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE)
+ return;
+ else if (ScanDirectionIsBackward(dir) &&
+ pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE)
+ return;
+
+ /*
+ * The look ahead distance starts small, and ramps up as each call here
+ * allows _bt_readpage to skip over more tuples
+ */
+ if (!pstate->targetdistance)
+ pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
+ else
+ pstate->targetdistance *= 2;
+
+ /* Don't read past the end (or before the start) of the page, though */
+ if (ScanDirectionIsForward(dir))
+ aheadoffnum = Min((int) pstate->maxoff,
+ (int) pstate->offnum + pstate->targetdistance);
+ else
+ aheadoffnum = Max((int) pstate->minoff,
+ (int) pstate->offnum - pstate->targetdistance);
+
+ ahead = (IndexTuple) PageGetItem(pstate->page,
+ PageGetItemId(pstate->page, aheadoffnum));
+ if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts,
+ false, 0, NULL))
+ {
+ /*
+ * Success -- instruct _bt_readpage to skip ahead to very next tuple
+ * after the one we determined was still before the current array keys
+ */
+ if (ScanDirectionIsForward(dir))
+ pstate->skip = aheadoffnum + 1;
+ else
+ pstate->skip = aheadoffnum - 1;
+ }
+ else
+ {
+ /*
+ * Failure -- "ahead" tuple is too far ahead (we were too aggresive).
+ *
+ * Reset the number of rechecks, and aggressively reduce the target
+ * distance (we're much more aggressive here than we were when the
+ * distance was initially ramped up).
+ */
+ pstate->rechecks = 0;
+ pstate->targetdistance = Max(pstate->targetdistance / 8, 1);
+ }
+}
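The ramp-up/back-off heuristic can be simulated on its own. A hedged standalone sketch, assuming LOOK_AHEAD_DEFAULT_DISTANCE is 5 (check nbtutils.c for the real value):

    #include <stdio.h>

    #define LOOK_AHEAD_DEFAULT_DISTANCE 5   /* assumed value */

    int
    main(void)
    {
        int targetdistance = 0;

        /* Successful look-aheads double the distance... */
        for (int i = 0; i < 4; i++)
        {
            if (!targetdistance)
                targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE;
            else
                targetdistance *= 2;
            printf("after success %d: distance %d\n", i + 1, targetdistance);
        }

        /* ...while a single failure aggressively backs off (divide by 8) */
        targetdistance = (targetdistance / 8) > 1 ? targetdistance / 8 : 1;
        printf("after failure: distance %d\n", targetdistance);
        return 0;
    }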
+
/*
* _bt_killitems - set LP_DEAD state for items an indexscan caller has
* told us were killed
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 9e35aaf56e..fcf6d1d932 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -628,6 +628,8 @@ ExecIndexOnlyScanEstimate(IndexOnlyScanState *node,
EState *estate = node->ss.ps.state;
node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc,
+ node->ioss_NumScanKeys,
+ node->ioss_NumOrderByKeys,
estate->es_snapshot);
shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
shm_toc_estimate_keys(&pcxt->estimator, 1);
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 2a3264599d..8000feff4c 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -1644,6 +1644,8 @@ ExecIndexScanEstimate(IndexScanState *node,
EState *estate = node->ss.ps.state;
node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc,
+ node->iss_NumScanKeys,
+ node->iss_NumOrderByKeys,
estate->es_snapshot);
shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen);
shm_toc_estimate_keys(&pcxt->estimator, 1);
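From an index AM author's perspective, the extra parameters let shared-memory sizing scale with the scan's key counts. A hedged sketch of a hypothetical AM callback (the struct and AM name are invented):

    #include <stddef.h>
    #include <stdio.h>

    typedef size_t Size;            /* stand-in for PostgreSQL's Size */

    /* Hypothetical AM-private parallel scan state, sized per scan key */
    typedef struct MyAMParallelScan
    {
        int  nkeys;
        long keyprogress[];         /* flexible array: one slot per scan key */
    } MyAMParallelScan;

    static Size
    myam_estimateparallelscan(int nkeys, int norderbys)
    {
        (void) norderbys;           /* unused by this hypothetical AM */
        return offsetof(MyAMParallelScan, keyprogress) +
            (Size) nkeys * sizeof(long);
    }

    int
    main(void)
    {
        printf("bytes for 3 keys: %zu\n", myam_estimateparallelscan(3, 0));
        return 0;
    }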
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 32c6a8bbdc..2230b13104 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexOptInfo *index, IndexClauseSet *clauses,
bool useful_predicate,
ScanTypeControl scantype,
- bool *skip_nonnative_saop,
- bool *skip_lower_saop);
+ bool *skip_nonnative_saop);
static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
List *clauses, List *other_clauses);
static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
@@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids,
* index AM supports them natively, we should just include them in simple
* index paths. If not, we should exclude them while building simple index
* paths, and then make a separate attempt to include them in bitmap paths.
- * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr
- * quals so as to create ordered paths.
*/
static void
get_index_paths(PlannerInfo *root, RelOptInfo *rel,
@@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
{
List *indexpaths;
bool skip_nonnative_saop = false;
- bool skip_lower_saop = false;
ListCell *lc;
/*
* Build simple index paths using the clauses. Allow ScalarArrayOpExpr
- * clauses only if the index AM supports them natively, and skip any such
- * clauses for index columns after the first (so that we produce ordered
- * paths if possible).
+ * clauses only if the index AM supports them natively.
*/
indexpaths = build_index_paths(root, rel,
index, clauses,
index->predOK,
ST_ANYSCAN,
- &skip_nonnative_saop,
- &skip_lower_saop);
-
- /*
- * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM
- * that supports them, then try again including those clauses. This will
- * produce paths with more selectivity but no ordering.
- */
- if (skip_lower_saop)
- {
- indexpaths = list_concat(indexpaths,
- build_index_paths(root, rel,
- index, clauses,
- index->predOK,
- ST_ANYSCAN,
- &skip_nonnative_saop,
- NULL));
- }
+ &skip_nonnative_saop);
/*
* Submit all the ones that can form plain IndexScan plans to add_path. (A
@@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
index, clauses,
false,
ST_BITMAPSCAN,
- NULL,
NULL);
*bitindexpaths = list_concat(*bitindexpaths, indexpaths);
}
@@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
* to true if we found any such clauses (caller must initialize the variable
* to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses.
*
- * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for
- * non-first index columns, and we set *skip_lower_saop to true if we found
- * any such clauses (caller must initialize the variable to false). If it's
- * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will
- * result in considering the scan's output to be unordered.
- *
* 'rel' is the index's heap relation
* 'index' is the index for which we want to generate paths
* 'clauses' is the collection of indexable clauses (IndexClause nodes)
* 'useful_predicate' indicates whether the index has a useful predicate
* 'scantype' indicates whether we need plain or bitmap scan support
* 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't
- * 'skip_lower_saop' indicates whether to accept non-first-column SAOP
*/
static List *
build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexOptInfo *index, IndexClauseSet *clauses,
bool useful_predicate,
ScanTypeControl scantype,
- bool *skip_nonnative_saop,
- bool *skip_lower_saop)
+ bool *skip_nonnative_saop)
{
List *result = NIL;
IndexPath *ipath;
@@ -848,12 +816,13 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
List *orderbyclausecols;
List *index_pathkeys;
List *useful_pathkeys;
- bool found_lower_saop_clause;
bool pathkeys_possibly_useful;
bool index_is_ordered;
bool index_only_scan;
int indexcol;
+ Assert(skip_nonnative_saop != NULL || scantype == ST_BITMAPSCAN);
+
/*
* Check that index supports the desired scan type(s)
*/
@@ -880,19 +849,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
* on by btree and possibly other places.) The list can be empty, if the
* index AM allows that.
*
- * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr
- * index clause for a non-first index column. This prevents us from
- * assuming that the scan result is ordered. (Actually, the result is
- * still ordered if there are equality constraints for all earlier
- * columns, but it seems too expensive and non-modular for this code to be
- * aware of that refinement.)
- *
* We also build a Relids set showing which outer rels are required by the
* selected clauses. Any lateral_relids are included in that, but not
* otherwise accounted for.
*/
index_clauses = NIL;
- found_lower_saop_clause = false;
outer_relids = bms_copy(rel->lateral_relids);
for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++)
{
@@ -903,30 +864,18 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
IndexClause *iclause = (IndexClause *) lfirst(lc);
RestrictInfo *rinfo = iclause->rinfo;
- /* We might need to omit ScalarArrayOpExpr clauses */
- if (IsA(rinfo->clause, ScalarArrayOpExpr))
+ if (skip_nonnative_saop && !index->amsearcharray &&
+ IsA(rinfo->clause, ScalarArrayOpExpr))
{
- if (!index->amsearcharray)
- {
- if (skip_nonnative_saop)
- {
- /* Ignore because not supported by index */
- *skip_nonnative_saop = true;
- continue;
- }
- /* Caller had better intend this only for bitmap scan */
- Assert(scantype == ST_BITMAPSCAN);
- }
- if (indexcol > 0)
- {
- if (skip_lower_saop)
- {
- /* Caller doesn't want to lose index ordering */
- *skip_lower_saop = true;
- continue;
- }
- found_lower_saop_clause = true;
- }
+ /*
+ * Caller asked us to generate IndexPaths that omit any
+ * ScalarArrayOpExpr clauses when the underlying index AM
+ * lacks native support.
+ *
+ * We must omit this clause (and tell caller about it).
+ */
+ *skip_nonnative_saop = true;
+ continue;
}
/* OK to include this clause */
@@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
/*
* 2. Compute pathkeys describing index's ordering, if any, then see how
* many of them are actually useful for this query. This is not relevant
- * if we are only trying to build bitmap indexscans, nor if we have to
- * assume the scan is unordered.
+ * if we are only trying to build bitmap indexscans.
*/
pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN &&
- !found_lower_saop_clause &&
has_useful_pathkeys(root, rel));
index_is_ordered = (index->sortopfamily != NULL);
if (index_is_ordered && pathkeys_possibly_useful)
@@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
index, &clauseset,
useful_predicate,
ST_BITMAPSCAN,
- NULL,
NULL);
result = list_concat(result, indexpaths);
}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cea777e9d4..35f8f306ee 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6572,21 +6572,26 @@ genericcostestimate(PlannerInfo *root,
selectivityQuals = add_predicate_to_index_quals(index, indexQuals);
/*
- * Check for ScalarArrayOpExpr index quals, and estimate the number of
- * index scans that will be performed.
+ * If caller didn't give us an estimate for ScalarArrayOpExpr index scans,
+ * just assume that the number of index descents is the number of distinct
+ * combinations of array elements from all of the scan's SAOP clauses.
*/
- num_sa_scans = 1;
- foreach(l, indexQuals)
+ num_sa_scans = costs->num_sa_scans;
+ if (num_sa_scans < 1)
{
- RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
-
- if (IsA(rinfo->clause, ScalarArrayOpExpr))
+ num_sa_scans = 1;
+ foreach(l, indexQuals)
{
- ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
- double alength = estimate_array_length(root, lsecond(saop->args));
+ RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
- if (alength > 1)
- num_sa_scans *= alength;
+ if (IsA(rinfo->clause, ScalarArrayOpExpr))
+ {
+ ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
+ double alength = estimate_array_length(root, lsecond(saop->args));
+
+ if (alength > 1)
+ num_sa_scans *= alength;
+ }
}
}
@@ -6813,9 +6818,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* For a RowCompareExpr, we consider only the first column, just as
* rowcomparesel() does.
*
- * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
- * index scans not one, but the ScalarArrayOpExpr's operator can be
- * considered to act the same as it normally does.
+ * If there's a ScalarArrayOpExpr in the quals, we'll actually perform up
+ * to N index descents (not just one), but the ScalarArrayOpExpr's
+ * operator can be considered to act the same as it normally does.
*/
indexBoundQuals = NIL;
indexcol = 0;
@@ -6867,7 +6872,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
clause_op = saop->opno;
found_saop = true;
- /* count number of SA scans induced by indexBoundQuals only */
+ /* estimate SA descents by indexBoundQuals only */
if (alength > 1)
num_sa_scans *= alength;
}
@@ -6930,10 +6935,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
NULL);
numIndexTuples = btreeSelectivity * index->rel->tuples;
+ /*
+ * btree automatically combines individual ScalarArrayOpExpr primitive
+ * index scans whenever the tuples covered by the next set of array
+ * keys are close to tuples covered by the current set. That puts a
+ * natural ceiling on the worst case number of descents -- there
+ * cannot possibly be more than one descent per leaf page scanned.
+ *
+ * Clamp the number of descents to at most 1/3 the number of index
+ * pages. This avoids implausibly high estimates with low selectivity
+ * paths, where scans usually require only one or two descents. This
+ * is most likely to help when there are several SAOP clauses, where
+ * naively accepting the total number of distinct combinations of
+ * array elements as the number of descents would frequently lead to
+ * wild overestimates.
+ *
+ * We somewhat arbitrarily don't just make the cutoff the total number
+ * of leaf pages (we make it 1/3 the total number of pages instead) to
+ * give the btree code credit for its ability to continue on the leaf
+ * level with low selectivity scans.
+ */
+ num_sa_scans = Min(num_sa_scans, ceil(index->pages * 0.3333333));
+ num_sa_scans = Max(num_sa_scans, 1);
+
/*
* As in genericcostestimate(), we have to adjust for any
* ScalarArrayOpExpr quals included in indexBoundQuals, and then round
* to integer.
+ *
+ * It is tempting to make genericcostestimate behave as if SAOP
+ * clauses work in almost the same way as scalar operators during
+ * btree scans, making the top-level scan look like a continuous scan
+ * (as opposed to num_sa_scans-many primitive index scans). After
+ * all, btree scans mostly work like that at runtime. However, such a
+ * scheme would badly bias genericcostestimate's simplistic approach
+ * to calculating numIndexPages through prorating.
+ *
+ * Stick with the approach taken by non-native SAOP scans for now.
+ * genericcostestimate will use the Mackert-Lohman formula to
+ * compensate for repeat page fetches, even though that definitely
+ * won't happen during btree scans (not for leaf pages, at least).
+ * We're usually very pessimistic about the number of primitive index
+ * scans that will be required, but it's not clear how to do better.
*/
numIndexTuples = rint(numIndexTuples / num_sa_scans);
}
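A worked example of this clamp, as a hedged standalone sketch with invented numbers: three SAOP clauses of ten elements each naively suggest 1,000 descents, but a 600-page index caps the estimate at 200:

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        double num_sa_scans = 10.0 * 10.0 * 10.0; /* combinations of array keys */
        double index_pages = 600.0;               /* hypothetical index size */

        /* Clamp descents to 1/3 of the index's pages, but never below 1 */
        num_sa_scans = fmin(num_sa_scans, ceil(index_pages * 0.3333333));
        num_sa_scans = fmax(num_sa_scans, 1);

        printf("estimated descents: %.0f\n", num_sa_scans); /* prints 200 */
        return 0;
    }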
@@ -6942,6 +6985,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* Now do generic index cost estimation.
*/
costs.numIndexTuples = numIndexTuples;
+ costs.num_sa_scans = num_sa_scans;
genericcostestimate(root, path, loop_count, &costs);
@@ -6952,9 +6996,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* comparisons to descend a btree of N leaf tuples. We charge one
* cpu_operator_cost per comparison.
*
- * If there are ScalarArrayOpExprs, charge this once per SA scan. The
- * ones after the first one are not startup cost so far as the overall
- * plan is concerned, so add them only to "total" cost.
+ * If there are ScalarArrayOpExprs, charge this once per estimated SA
+ * index descent. The ones after the first one are not startup cost so
+ * far as the overall plan goes, so just add them to "total" cost.
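+ *
+ * (Illustration with made-up numbers: an index with 10000 leaf tuples
+ * needs roughly log2(10000) ~= 13.3 comparisons per descent.  With
+ * num_sa_scans = 4, one descent's comparisons count towards startup
+ * cost, while all four descents' comparisons count towards total
+ * cost.)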
*/
if (index->tuples > 1) /* avoid computing log(0) */
{
@@ -6971,7 +7015,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* in cases where only a single leaf page is expected to be visited. This
* cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
* touched. The number of such pages is btree tree height plus one (ie,
- * we charge for the leaf page too). As above, charge once per SA scan.
+ * we charge for the leaf page too). As above, charge once per estimated
+ * SA index descent.
*/
descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost;
costs.indexStartupCost += descentCost;
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 2c6c307efc..00300dd720 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -194,7 +194,7 @@ typedef void (*amrestrpos_function) (IndexScanDesc scan);
*/
/* estimate size of parallel scan descriptor */
-typedef Size (*amestimateparallelscan_function) (void);
+typedef Size (*amestimateparallelscan_function) (int nkeys, int norderbys);
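+
+/*
+ * As a hypothetical illustration (not taken from any in-core AM), an AM
+ * that wants one integer of shared state per scan key might implement
+ * this callback as
+ *
+ *		add_size(offsetof(FooShared, perkey), mul_size(sizeof(int), nkeys))
+ *
+ * where FooShared is a made-up struct whose flexible array member
+ * "perkey" holds nkeys integers.
+ */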
/* prepare for parallel index scan */
typedef void (*aminitparallelscan_function) (void *target);
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 8026c2b36d..fdcfbe8db7 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -165,7 +165,8 @@ extern void index_rescan(IndexScanDesc scan,
extern void index_endscan(IndexScanDesc scan);
extern void index_markpos(IndexScanDesc scan);
extern void index_restrpos(IndexScanDesc scan);
-extern Size index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot);
+extern Size index_parallelscan_estimate(Relation indexRelation,
+ int nkeys, int norderbys, Snapshot snapshot);
extern void index_parallelscan_initialize(Relation heapRelation,
Relation indexRelation, Snapshot snapshot,
ParallelIndexScanDesc target);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6eb162052e..b9053219a6 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -960,11 +960,20 @@ typedef struct BTScanPosData
* moreLeft and moreRight track whether we think there may be matching
* index entries to the left and right of the current page, respectively.
* We can clear the appropriate one of these flags when _bt_checkkeys()
- * returns continuescan = false.
+ * sets BTReadPageState.continuescan = false.
*/
bool moreLeft;
bool moreRight;
+ /*
+ * Direction of the scan at the time that _bt_readpage was called.
+ *
+ * Used by btrestrpos to "restore" the scan's array keys by resetting each
+ * array to its first element's value (first in this scan direction). This
+ * avoids the need to directly track the array keys in btmarkpos.
+ */
+ ScanDirection dir;
+
/*
* If we are doing an index-only scan, nextTupleOffset is the first free
* location in the associated tuple storage workspace.
@@ -1022,9 +1031,8 @@ typedef BTScanPosData *BTScanPos;
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
{
- int scan_key; /* index of associated key in arrayKeyData */
+ int scan_key; /* index of associated key in keyData */
int cur_elem; /* index of current element in elem_values */
- int mark_elem; /* index of marked element in elem_values */
int num_elems; /* number of elems in current array value */
Datum *elem_values; /* array of num_elems Datums */
} BTArrayKeyInfo;
@@ -1037,14 +1045,11 @@ typedef struct BTScanOpaqueData
ScanKey keyData; /* array of preprocessed scan keys */
/* workspace for SK_SEARCHARRAY support */
- ScanKey arrayKeyData; /* modified copy of scan->keyData */
- bool arraysStarted; /* Started array keys, but have yet to "reach
- * past the end" of all arrays? */
- int numArrayKeys; /* number of equality-type array keys (-1 if
- * there are any unsatisfiable array keys) */
- int arrayKeyCount; /* count indicating number of array scan keys
- * processed */
+ int numArrayKeys; /* number of equality-type array keys */
+ bool needPrimScan; /* New prim scan to continue in current dir? */
+ bool scanBehind; /* Last array advancement matched -inf attr? */
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
+ FmgrInfo *orderProcs; /* ORDER procs for required equality keys */
MemoryContext arrayContext; /* scan-lifespan context for array data */
/* info about killed items if any (killedItems is NULL if never used) */
@@ -1075,6 +1080,42 @@ typedef struct BTScanOpaqueData
typedef BTScanOpaqueData *BTScanOpaque;
+/*
+ * _bt_readpage state used across _bt_checkkeys calls for a page
+ */
+typedef struct BTReadPageState
+{
+ /* Input parameters, set by _bt_readpage for _bt_checkkeys */
+ ScanDirection dir; /* current scan direction */
+ OffsetNumber minoff; /* Lowest non-pivot tuple's offset */
+ OffsetNumber maxoff; /* Highest non-pivot tuple's offset */
+ IndexTuple finaltup; /* Needed by scans with array keys */
+ BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
+ Page page; /* Page being read */
+
+ /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
+ OffsetNumber offnum; /* current tuple's page offset number */
+
+ /* Output parameters, set by _bt_checkkeys for _bt_readpage */
+ OffsetNumber skip; /* Array keys "look ahead" skip offnum */
+ bool continuescan; /* Terminate ongoing (primitive) index scan? */
+
+ /*
+ * Input and output parameters, set and unset by both _bt_readpage and
+ * _bt_checkkeys to manage precheck optimizations
+ */
+ bool prechecked; /* precheck set continuescan to 'true'? */
+ bool firstmatch; /* at least one match so far? */
+
+ /*
+ * Private _bt_checkkeys state used to manage "look ahead" optimization
+ * (only used during scans with array keys)
+ */
+ int16 rechecks;
+ int16 targetdistance;
+
+} BTReadPageState;
+
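+/*
+ * Hypothetical sketch of the intended call pattern (forward scan only,
+ * details simplified; the real _bt_readpage logic differs):
+ *
+ *		pstate.dir = dir;
+ *		pstate.minoff = minoff;
+ *		pstate.maxoff = maxoff;
+ *		for (offnum = minoff; offnum <= maxoff; offnum++)
+ *		{
+ *			pstate.offnum = offnum;
+ *			if (_bt_checkkeys(scan, &pstate, arrayKeys, itup, tupnatts))
+ *				remember itup as a match;
+ *			if (!pstate.continuescan)
+ *				break;
+ *		}
+ */
+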
/*
* We use some private sk_flags bits in preprocessed scan keys. We're allowed
* to use bits 16-31 (see skey.h). The uppermost bits are copied from the
@@ -1128,7 +1169,7 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull,
bool indexUnchanged,
struct IndexInfo *indexInfo);
extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
-extern Size btestimateparallelscan(void);
+extern Size btestimateparallelscan(int nkeys, int norderbys);
extern void btinitparallelscan(void *target);
extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
@@ -1149,10 +1190,12 @@ extern bool btcanreturn(Relation index, int attno);
/*
* prototypes for internal functions in nbtree.c
*/
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
+ bool first);
extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
-extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
+extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
+ BlockNumber prev_scan_page);
/*
* prototypes for functions in nbtdedup.c
@@ -1243,15 +1286,11 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
*/
extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
extern void _bt_freestack(BTStack stack);
-extern void _bt_preprocess_array_keys(IndexScanDesc scan);
+extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern void _bt_mark_array_keys(IndexScanDesc scan);
-extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
-extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
- int tupnatts, ScanDirection dir, bool *continuescan,
- bool requiredMatchedByPrecheck, bool haveFirstMatch);
+extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+ IndexTuple tuple, int tupnatts);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 2fa4c4fc1b..f2563ad1cb 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -117,6 +117,9 @@ typedef struct VariableStatData
* Callers should initialize all fields of GenericCosts to zero. In addition,
* they can set numIndexTuples to some positive value if they have a better
* than default way of estimating the number of leaf index tuples visited.
+ * Similarly, they can set num_sa_scans to some value >= 1 for an index AM
+ * that doesn't necessarily perform exactly one primitive index scan per
+ * distinct combination of ScalarArrayOp array elements.
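+ * (For example, btcostestimate sets num_sa_scans to its own clamped
+ * estimate of the number of btree index descents before calling
+ * genericcostestimate.)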
*/
typedef struct
{
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index 8311a03c3d..510646cbce 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -189,6 +189,58 @@ select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limi
48 | 8
(1 row)
+--
+-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants
+--
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+ QUERY PLAN
+------------------------------------------------------------------
+ Unique
+ -> Index Only Scan using tenk1_hundred on tenk1
+ Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[]))
+(3 rows)
+
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+ hundred
+---------
+ 47
+ 48
+ 72
+ 82
+(4 rows)
+
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+ QUERY PLAN
+------------------------------------------------------------------
+ Unique
+ -> Index Only Scan Backward using tenk1_hundred on tenk1
+ Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[]))
+(3 rows)
+
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+ hundred
+---------
+ 82
+ 72
+ 48
+ 47
+(4 rows)
+
+explain (costs off)
+select thousand from tenk1 where thousand in (364, 366, 380) and tenthous = 200000;
+ QUERY PLAN
+---------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_thous_tenthous on tenk1
+ Index Cond: ((thousand = ANY ('{364,366,380}'::integer[])) AND (tenthous = 200000))
+(2 rows)
+
+select thousand from tenk1 where thousand in (364, 366, 380) and tenthous = 200000;
+ thousand
+----------
+(0 rows)
+
--
-- Check correct optimization of LIKE (special index operator support)
-- for both indexscan and bitmapscan cases
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 70ab47a92f..cf6eac5734 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -1698,6 +1698,12 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500;
0
(1 row)
+SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1);
+ count
+-------
+ 1
+(1 row)
+
DROP INDEX onek_nulltest;
CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc nulls last,unique1);
SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL;
@@ -1910,7 +1916,7 @@ SELECT count(*) FROM dupindexcols
(1 row)
--
--- Check ordering of =ANY indexqual results (bug in 9.2.0)
+-- Check that index scans with =ANY indexquals return rows in index order
--
explain (costs off)
SELECT unique1 FROM tenk1
@@ -1932,49 +1938,186 @@ ORDER BY unique1;
42
(3 rows)
+-- Non-required array scan key on "tenthous":
explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;
- QUERY PLAN
--------------------------------------------------------
+ QUERY PLAN
+--------------------------------------------------------------------------------
Index Only Scan using tenk1_thous_tenthous on tenk1
- Index Cond: (thousand < 2)
- Filter: (tenthous = ANY ('{1001,3000}'::integer[]))
+ Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
+(2 rows)
+
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand < 2 AND tenthous IN (1001,3000)
+ORDER BY thousand;
+ thousand | tenthous
+----------+----------
+ 0 | 3000
+ 1 | 1001
+(2 rows)
+
+-- Non-required array scan key on "tenthous", backward scan:
+explain (costs off)
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand < 2 AND tenthous IN (1001,3000)
+ORDER BY thousand DESC, tenthous DESC;
+ QUERY PLAN
+--------------------------------------------------------------------------------
+ Index Only Scan Backward using tenk1_thous_tenthous on tenk1
+ Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
+(2 rows)
+
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand < 2 AND tenthous IN (1001,3000)
+ORDER BY thousand DESC, tenthous DESC;
+ thousand | tenthous
+----------+----------
+ 1 | 1001
+ 0 | 3000
+(2 rows)
+
+--
+-- Check elimination of redundant and contradictory index quals
+--
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+ QUERY PLAN
+----------------------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = ANY ('{7,8,9}'::integer[])))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+ unique1
+---------
+ 7
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+ QUERY PLAN
+----------------------------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{7,14,22}'::integer[])) AND (unique1 = ANY ('{33,44}'::bigint[])))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+ unique1
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+ QUERY PLAN
+---------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 1))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+ unique1
+---------
+ 1
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+ QUERY PLAN
+-------------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 12345))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+ unique1
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 >= 42))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+ unique1
+---------
+ 42
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 > 42))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+ unique1
+---------
+(0 rows)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+ QUERY PLAN
+--------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 > 9996) AND (unique1 >= 9999))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+ unique1
+---------
+ 9999
+(1 row)
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+ QUERY PLAN
+--------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 < 3) AND (unique1 <= 3))
+(2 rows)
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+ unique1
+---------
+ 0
+ 1
+ 2
(3 rows)
-SELECT thousand, tenthous FROM tenk1
-WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
- thousand | tenthous
-----------+----------
- 0 | 3000
- 1 | 1001
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+ QUERY PLAN
+------------------------------------------------------------
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 < 3) AND (unique1 < '-1'::bigint))
(2 rows)
-SET enable_indexonlyscan = OFF;
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+ unique1
+---------
+(0 rows)
+
explain (costs off)
-SELECT thousand, tenthous FROM tenk1
-WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
QUERY PLAN
--------------------------------------------------------------------------------------
- Sort
- Sort Key: thousand
- -> Index Scan using tenk1_thous_tenthous on tenk1
- Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[])))
-(4 rows)
-
-SELECT thousand, tenthous FROM tenk1
-WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
- thousand | tenthous
-----------+----------
- 0 | 3000
- 1 | 1001
+ Index Only Scan using tenk1_unique1 on tenk1
+ Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 < '-1'::bigint))
(2 rows)
-RESET enable_indexonlyscan;
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
+ unique1
+---------
+(0 rows)
+
--
-- Check elimination of constant-NULL subexpressions
--
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 63cddac0d6..8b640c2fc2 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -8880,10 +8880,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]);
Merge Cond: (j1.id1 = j2.id1)
Join Filter: (j2.id2 = j1.id2)
-> Index Scan using j1_id1_idx on j1
- -> Index Only Scan using j2_pkey on j2
+ -> Index Scan using j2_id1_idx on j2
Index Cond: (id1 >= ANY ('{1,5}'::integer[]))
- Filter: ((id1 % 1000) = 1)
-(7 rows)
+(6 rows)
select * from j1
inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 4ffc5b4c56..87273fa635 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -361,6 +361,7 @@ alter table tenk2 reset (parallel_workers);
-- test parallel index scans.
set enable_seqscan to off;
set enable_bitmapscan to off;
+set random_page_cost = 2;
explain (costs off)
select count((unique1)) from tenk1 where hundred > 1;
QUERY PLAN
@@ -379,6 +380,30 @@ select count((unique1)) from tenk1 where hundred > 1;
9800
(1 row)
+-- Parallel ScalarArrayOp index scan
+explain (costs off)
+ select count((unique1)) from tenk1
+ where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+ QUERY PLAN
+---------------------------------------------------------------------
+ Finalize Aggregate
+ InitPlan 1
+ -> Aggregate
+ -> Function Scan on generate_series i
+ -> Gather
+ Workers Planned: 4
+ -> Partial Aggregate
+ -> Parallel Index Scan using tenk1_hundred on tenk1
+ Index Cond: (hundred = ANY ((InitPlan 1).col1))
+(9 rows)
+
+select count((unique1)) from tenk1
+where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+ count
+-------
+ 700
+(1 row)
+
-- test parallel index-only scans.
explain (costs off)
select count(*) from tenk1 where thousand > 95;
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index ef84354234..0d2a33f370 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -135,6 +135,21 @@ explain (costs off)
select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1;
+--
+-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants
+--
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82);
+
+explain (costs off)
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc;
+
+explain (costs off)
+select thousand from tenk1 where thousand in (364, 366, 380) and tenthous = 200000;
+select thousand from tenk1 where thousand in (364, 366, 380) and tenthous = 200000;
+
--
-- Check correct optimization of LIKE (special index operator support)
-- for both indexscan and bitmapscan cases
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index d49ce9f300..e296891cab 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -668,6 +668,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL;
SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL;
SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500;
SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500;
+SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1);
DROP INDEX onek_nulltest;
@@ -753,7 +754,7 @@ SELECT count(*) FROM dupindexcols
WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX';
--
--- Check ordering of =ANY indexqual results (bug in 9.2.0)
+-- Check that index scans with =ANY indexquals return rows in index order
--
explain (costs off)
@@ -765,6 +766,7 @@ SELECT unique1 FROM tenk1
WHERE unique1 IN (1,42,7)
ORDER BY unique1;
+-- Non-required array scan key on "tenthous":
explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
@@ -774,18 +776,68 @@ SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
ORDER BY thousand;
-SET enable_indexonlyscan = OFF;
-
+-- Non-required array scan key on "tenthous", backward scan:
explain (costs off)
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+ORDER BY thousand DESC, tenthous DESC;
SELECT thousand, tenthous FROM tenk1
WHERE thousand < 2 AND tenthous IN (1001,3000)
-ORDER BY thousand;
+ORDER BY thousand DESC, tenthous DESC;
-RESET enable_indexonlyscan;
+--
+-- Check elimination of redundant and contradictory index quals
+--
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}');
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+
+SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]);
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+
+SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+
+SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint;
+
+explain (costs off)
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
+
+SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint;
--
-- Check elimination of constant-NULL subexpressions
diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
index c43a5b2119..20376c03fa 100644
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -137,11 +137,19 @@ alter table tenk2 reset (parallel_workers);
-- test parallel index scans.
set enable_seqscan to off;
set enable_bitmapscan to off;
+set random_page_cost = 2;
explain (costs off)
select count((unique1)) from tenk1 where hundred > 1;
select count((unique1)) from tenk1 where hundred > 1;
+-- Parallel ScalarArrayOp index scan
+explain (costs off)
+ select count((unique1)) from tenk1
+ where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+select count((unique1)) from tenk1
+where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]);
+
-- test parallel index-only scans.
explain (costs off)
select count(*) from tenk1 where thousand > 95;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 01845ee71d..f87e8b80ec 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -208,8 +208,10 @@ BTPageStat
BTPageState
BTParallelScanDesc
BTPendingFSM
+BTReadPageState
BTScanInsert
BTScanInsertData
+BTScanKeyPreproc
BTScanOpaque
BTScanOpaqueData
BTScanPos