/*-------------------------------------------------------------------------
 *
 * _int_selfuncs.c
 *	  Functions for selectivity estimation of intarray operators
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  contrib/intarray/_int_selfuncs.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "_int.h"
|
|
|
|
|
|
|
|
#include "access/htup_details.h"
|
|
|
|
#include "catalog/pg_operator.h"
|
|
|
|
#include "catalog/pg_statistic.h"
|
|
|
|
#include "catalog/pg_type.h"
|
2016-12-29 01:00:00 +08:00
|
|
|
#include "utils/builtins.h"
|
2015-07-22 01:54:18 +08:00
|
|
|
#include "utils/selfuncs.h"
|
|
|
|
#include "utils/syscache.h"
|
|
|
|
#include "utils/lsyscache.h"
|
|
|
|
#include "miscadmin.h"
|
|
|
|
|
|
|
|
/*
 * Version-1 calling-convention declarations for the selectivity
 * estimators defined below; required so the backend can call them
 * as dynamically loaded C functions.
 */
PG_FUNCTION_INFO_V1(_int_overlap_sel);
PG_FUNCTION_INFO_V1(_int_contains_sel);
PG_FUNCTION_INFO_V1(_int_contained_sel);
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
PG_FUNCTION_INFO_V1(_int_matchsel);
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Forward declarations for the local helpers used by _int_matchsel.
 *
 * Parameter names renamed for consistency with the call site in
 * _int_matchsel: "nmncelems" was a typo for "nmcelems", and the
 * MCE arrays are passed as mcelems/mcefreqs there.
 */
static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems,
									   float4 *mcefreqs, int nmcelems,
									   float4 minfreq);
static int	compare_val_int4(const void *a, const void *b);
|
|
|
|
|
|
|
|
/*
 * Wrappers around the default array selectivity estimation functions.
 *
 * The default array selectivity operators for the @>, && and @< operators
 * work fine for integer arrays.  However, if we tried to just use arraycontsel
 * and arraycontjoinsel directly as the cost estimator functions for our
 * operators, they would not work as intended, because they look at the
 * operator's OID.  Our operators behave exactly like the built-in anyarray
 * versions, but we must tell the cost estimator functions which built-in
 * operators they correspond to.  These wrappers just replace the operator
 * OID with the corresponding built-in operator's OID, and call the built-in
 * function.
 */
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_overlap_sel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_contains_sel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_contained_sel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_overlap_joinsel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3),
|
|
|
|
PG_GETARG_DATUM(4)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_contains_joinsel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3),
|
|
|
|
PG_GETARG_DATUM(4)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
_int_contained_joinsel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
|
|
|
|
PG_GETARG_DATUM(0),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-22 03:35:54 +08:00
|
|
|
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
|
2015-07-22 01:54:18 +08:00
|
|
|
PG_GETARG_DATUM(2),
|
|
|
|
PG_GETARG_DATUM(3),
|
|
|
|
PG_GETARG_DATUM(4)));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * _int_matchsel -- restriction selectivity function for intarray @@ query_int
 */
|
|
|
|
Datum
|
|
|
|
_int_matchsel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
|
|
|
|
|
|
|
|
List *args = (List *) PG_GETARG_POINTER(2);
|
|
|
|
int varRelid = PG_GETARG_INT32(3);
|
|
|
|
VariableStatData vardata;
|
|
|
|
Node *other;
|
|
|
|
bool varonleft;
|
|
|
|
Selectivity selec;
|
|
|
|
QUERYTYPE *query;
|
|
|
|
Datum *mcelems = NULL;
|
|
|
|
float4 *mcefreqs = NULL;
|
|
|
|
int nmcelems = 0;
|
|
|
|
float4 minfreq = 0.0;
|
|
|
|
float4 nullfrac = 0.0;
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
AttStatsSlot sslot;
|
2015-07-22 01:54:18 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If expression is not "variable @@ something" or "something @@ variable"
|
|
|
|
* then punt and return a default estimate.
|
|
|
|
*/
|
|
|
|
if (!get_restriction_variable(root, args, varRelid,
|
|
|
|
&vardata, &other, &varonleft))
|
|
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Variable should be int[]. We don't support cases where variable is
|
|
|
|
* query_int.
|
|
|
|
*/
|
|
|
|
if (vardata.vartype != INT4ARRAYOID)
|
|
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Can't do anything useful if the something is not a constant, either.
|
|
|
|
*/
|
|
|
|
if (!IsA(other, Const))
|
|
|
|
{
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The "@@" operator is strict, so we can cope with NULL right away.
|
|
|
|
*/
|
|
|
|
if (((Const *) other)->constisnull)
|
|
|
|
{
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
PG_RETURN_FLOAT8(0.0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The caller made sure the const is a query, so get it now */
|
|
|
|
query = DatumGetQueryTypeP(((Const *) other)->constvalue);
|
|
|
|
|
|
|
|
/* Empty query matches nothing */
|
|
|
|
if (query->size == 0)
|
|
|
|
{
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
return (Selectivity) 0.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the statistics for the intarray column.
|
|
|
|
*
|
|
|
|
* We're interested in the Most-Common-Elements list, and the NULL
|
|
|
|
* fraction.
|
|
|
|
*/
|
|
|
|
if (HeapTupleIsValid(vardata.statsTuple))
|
|
|
|
{
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
Form_pg_statistic stats;
|
|
|
|
|
2015-07-22 01:54:18 +08:00
|
|
|
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
|
|
|
|
nullfrac = stats->stanullfrac;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For an int4 array, the default array type analyze function will
|
|
|
|
* collect a Most Common Elements list, which is an array of int4s.
|
|
|
|
*/
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
if (get_attstatsslot(&sslot, vardata.statsTuple,
|
2015-07-22 01:54:18 +08:00
|
|
|
STATISTIC_KIND_MCELEM, InvalidOid,
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
|
2015-07-22 01:54:18 +08:00
|
|
|
{
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
Assert(sslot.valuetype == INT4OID);
|
|
|
|
|
2015-07-22 01:54:18 +08:00
|
|
|
/*
|
|
|
|
* There should be three more Numbers than Values, because the
|
|
|
|
* last three (for intarray) cells are taken for minimal, maximal
|
|
|
|
* and nulls frequency. Punt if not.
|
|
|
|
*/
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
if (sslot.nnumbers == sslot.nvalues + 3)
|
2015-07-22 01:54:18 +08:00
|
|
|
{
|
|
|
|
/* Grab the lowest frequency. */
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
minfreq = sslot.numbers[sslot.nnumbers - (sslot.nnumbers - sslot.nvalues)];
|
2015-07-22 01:54:18 +08:00
|
|
|
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
mcelems = sslot.values;
|
|
|
|
mcefreqs = sslot.numbers;
|
|
|
|
nmcelems = sslot.nvalues;
|
2015-07-22 01:54:18 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
else
|
|
|
|
memset(&sslot, 0, sizeof(sslot));
|
2015-07-22 01:54:18 +08:00
|
|
|
|
|
|
|
/* Process the logical expression in the query, using the stats */
|
|
|
|
selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
|
|
|
|
mcelems, mcefreqs, nmcelems, minfreq);
|
|
|
|
|
|
|
|
/* MCE stats count only non-null rows, so adjust for null rows. */
|
|
|
|
selec *= (1.0 - nullfrac);
|
|
|
|
|
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed.
The mess cleaned up in commit da0759600 is clear evidence that it's a
bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot()
to provide the correct type OID for the array elements in the slot.
Moreover, we weren't even getting any performance benefit from that,
since get_attstatsslot() was extracting the real type OID from the array
anyway. So we ought to get rid of that requirement; indeed, it would
make more sense for get_attstatsslot() to pass back the type OID it found,
in case the caller isn't sure what to expect, which is likely in binary-
compatible-operator cases.
Another problem with the current implementation is that if the stats array
element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle
for each element. That seemed acceptable when the code was written because
we were targeting O(10) array sizes --- but these days, stats arrays are
almost always bigger than that, sometimes much bigger. We can save a
significant number of cycles by doing one palloc/memcpy/pfree of the whole
array. Indeed, in the now-probably-common case where the array is toasted,
that happens anyway so this method is basically free. (Note: although the
catcache code will inline any out-of-line toasted values, it doesn't
decompress them. At the other end of the size range, it doesn't expand
short-header datums either. In either case, DatumGetArrayTypeP would have
to make a copy. We do end up using an extra array copy step if the element
type is pass-by-value and the array length is neither small enough for a
short header nor large enough to have suffered compression. But that
seems like a very acceptable price for winning in pass-by-ref cases.)
Hence, redesign to take these insights into account. While at it,
convert to an API in which we fill a struct rather than passing a bunch
of pointers to individual output arguments. That will make it less
painful if we ever want further expansion of what get_attstatsslot can
pass back.
It's certainly arguable that this is new development and not something to
push post-feature-freeze. However, I view it as primarily bug-proofing
and therefore something that's better to have sooner not later. Since
we aren't quite at beta phase yet, let's put it in.
Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-14 03:14:39 +08:00
|
|
|
free_attstatsslot(&sslot);
|
2015-07-22 01:54:18 +08:00
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
|
|
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
|
|
|
|
PG_RETURN_FLOAT8((float8) selec);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Estimate selectivity of single intquery operator
|
|
|
|
*/
|
|
|
|
static Selectivity
|
|
|
|
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
|
|
|
|
int nmcelems, float4 minfreq)
|
|
|
|
{
|
|
|
|
Selectivity selec;
|
|
|
|
|
|
|
|
/* since this function recurses, it could be driven to stack overflow */
|
|
|
|
check_stack_depth();
|
|
|
|
|
|
|
|
if (item->type == VAL)
|
|
|
|
{
|
|
|
|
Datum *searchres;
|
|
|
|
|
|
|
|
if (mcelems == NULL)
|
|
|
|
return (Selectivity) DEFAULT_EQ_SEL;
|
|
|
|
|
|
|
|
searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
|
|
|
|
sizeof(Datum), compare_val_int4);
|
|
|
|
if (searchres)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The element is in MCELEM. Return precise selectivity (or at
|
|
|
|
* least as precise as ANALYZE could find out).
|
|
|
|
*/
|
|
|
|
selec = mcefreqs[searchres - mcelems];
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The element is not in MCELEM. Punt, but assume that the
|
|
|
|
* selectivity cannot be more than minfreq / 2.
|
|
|
|
*/
|
|
|
|
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (item->type == OPR)
|
|
|
|
{
|
|
|
|
/* Current query node is an operator */
|
|
|
|
Selectivity s1,
|
|
|
|
s2;
|
|
|
|
|
|
|
|
s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
|
|
|
|
minfreq);
|
|
|
|
switch (item->val)
|
|
|
|
{
|
|
|
|
case (int32) '!':
|
|
|
|
selec = 1.0 - s1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case (int32) '&':
|
|
|
|
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
|
|
|
|
nmcelems, minfreq);
|
|
|
|
selec = s1 * s2;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case (int32) '|':
|
|
|
|
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
|
|
|
|
nmcelems, minfreq);
|
|
|
|
selec = s1 + s2 - s1 * s2;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
elog(ERROR, "unrecognized operator: %d", item->val);
|
|
|
|
selec = 0; /* keep compiler quiet */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
elog(ERROR, "unrecognized int query item type: %u", item->type);
|
|
|
|
selec = 0; /* keep compiler quiet */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Clamp intermediate results to stay sane despite roundoff error */
|
|
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
|
|
|
|
return selec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Comparison function for binary search in mcelem array.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
compare_val_int4(const void *a, const void *b)
|
|
|
|
{
|
|
|
|
int32 key = *(int32 *) a;
|
|
|
|
const Datum *t = (const Datum *) b;
|
|
|
|
|
|
|
|
return key - DatumGetInt32(*t);
|
|
|
|
}
|