Add width_bucket(anyelement, anyarray).

This provides a convenient method of classifying input values into buckets
that are not necessarily equal-width.  It works on any sortable data type.

The choice of function name is a bit debatable, perhaps, but showing that
there's a relationship to the SQL standard's width_bucket() function seems
more attractive than the other proposals.

Petr Jelinek, reviewed by Pavel Stehule
This commit is contained in:
Tom Lane 2014-09-09 15:34:10 -04:00
parent 220bb39dee
commit e80252d424
7 changed files with 458 additions and 12 deletions

View File

@ -901,25 +901,40 @@
<indexterm>
<primary>width_bucket</primary>
</indexterm>
<literal><function>width_bucket(<parameter>op</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal>
</entry>
<literal><function>width_bucket(<parameter>operand</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
<entry><type>int</type></entry>
<entry>return the bucket to which <parameter>operand</> would
be assigned in an equidepth histogram with <parameter>count</>
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
<entry>return the bucket number to which <parameter>operand</> would
be assigned in a histogram having <parameter>count</> equal-width
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
an input outside the range</entry>
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
<entry><literal>3</literal></entry>
</row>
<row>
<entry><literal><function>width_bucket(<parameter>op</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
<entry><type>int</type></entry>
<entry>return the bucket to which <parameter>operand</> would
be assigned in an equidepth histogram with <parameter>count</>
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
<entry>return the bucket number to which <parameter>operand</> would
be assigned in a histogram having <parameter>count</> equal-width
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
an input outside the range</entry>
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
<entry><literal>3</literal></entry>
</row>
<row>
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>anyelement</type>, <parameter>thresholds</parameter> <type>anyarray</type>)</function></literal></entry>
<entry><type>int</type></entry>
<entry>return the bucket number to which <parameter>operand</> would
be assigned given an array listing the lower bounds of the buckets;
returns <literal>0</> for an input less than the first lower bound;
the <parameter>thresholds</> array <emphasis>must be sorted</>,
smallest first, or unexpected results will be obtained</entry>
<entry><literal>width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[])</literal></entry>
<entry><literal>2</literal></entry>
</row>
</tbody>
</tgroup>
</table>

View File

@ -15,8 +15,10 @@
#include "postgres.h"
#include <ctype.h>
#include <math.h>
#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "libpq/pqformat.h"
#include "utils/array.h"
@ -130,6 +132,15 @@ static ArrayType *array_replace_internal(ArrayType *array,
Datum replace, bool replace_isnull,
bool remove, Oid collation,
FunctionCallInfo fcinfo);
static int width_bucket_array_float8(Datum operand, ArrayType *thresholds);
static int width_bucket_array_fixed(Datum operand,
ArrayType *thresholds,
Oid collation,
TypeCacheEntry *typentry);
static int width_bucket_array_variable(Datum operand,
ArrayType *thresholds,
Oid collation,
TypeCacheEntry *typentry);
/*
@ -5502,3 +5513,235 @@ array_replace(PG_FUNCTION_ARGS)
fcinfo);
PG_RETURN_ARRAYTYPE_P(array);
}
/*
* Implements width_bucket(anyelement, anyarray).
*
* 'thresholds' is an array containing lower bound values for each bucket;
* these must be sorted from smallest to largest, or bogus results will be
* produced. If N thresholds are supplied, the output is from 0 to N:
* 0 is for inputs < first threshold, N is for inputs >= last threshold.
*/
Datum
width_bucket_array(PG_FUNCTION_ARGS)
{
Datum operand = PG_GETARG_DATUM(0);
ArrayType *thresholds = PG_GETARG_ARRAYTYPE_P(1);
Oid collation = PG_GET_COLLATION();
Oid element_type = ARR_ELEMTYPE(thresholds);
int result;
/* Check input */
if (ARR_NDIM(thresholds) > 1)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("thresholds must be one-dimensional array")));
if (array_contains_nulls(thresholds))
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("thresholds array must not contain NULLs")));
/* We have a dedicated implementation for float8 data */
if (element_type == FLOAT8OID)
result = width_bucket_array_float8(operand, thresholds);
else
{
TypeCacheEntry *typentry;
/* Cache information about the input type */
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
if (typentry == NULL ||
typentry->type_id != element_type)
{
typentry = lookup_type_cache(element_type,
TYPECACHE_CMP_PROC_FINFO);
if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("could not identify a comparison function for type %s",
format_type_be(element_type))));
fcinfo->flinfo->fn_extra = (void *) typentry;
}
/*
* We have separate implementation paths for fixed- and variable-width
* types, since indexing the array is a lot cheaper in the first case.
*/
if (typentry->typlen > 0)
result = width_bucket_array_fixed(operand, thresholds,
collation, typentry);
else
result = width_bucket_array_variable(operand, thresholds,
collation, typentry);
}
/* Avoid leaking memory when handed toasted input. */
PG_FREE_IF_COPY(thresholds, 1);
PG_RETURN_INT32(result);
}
/*
* width_bucket_array for float8 data.
*/
static int
width_bucket_array_float8(Datum operand, ArrayType *thresholds)
{
float8 op = DatumGetFloat8(operand);
float8 *thresholds_data;
int left;
int right;
/*
* Since we know the array contains no NULLs, we can just index it
* directly.
*/
thresholds_data = (float8 *) ARR_DATA_PTR(thresholds);
left = 0;
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
/*
* If the probe value is a NaN, it's greater than or equal to all possible
* threshold values (including other NaNs), so we need not search. Note
* that this would give the same result as searching even if the array
* contains multiple NaNs (as long as they're correctly sorted), since the
* loop logic will find the rightmost of multiple equal threshold values.
*/
if (isnan(op))
return right;
/* Find the bucket */
while (left < right)
{
int mid = (left + right) / 2;
if (isnan(thresholds_data[mid]) || op < thresholds_data[mid])
right = mid;
else
left = mid + 1;
}
return left;
}
/*
* width_bucket_array for generic fixed-width data types.
*/
static int
width_bucket_array_fixed(Datum operand,
ArrayType *thresholds,
Oid collation,
TypeCacheEntry *typentry)
{
char *thresholds_data;
int typlen = typentry->typlen;
bool typbyval = typentry->typbyval;
FunctionCallInfoData locfcinfo;
int left;
int right;
/*
* Since we know the array contains no NULLs, we can just index it
* directly.
*/
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
collation, NULL, NULL);
/* Find the bucket */
left = 0;
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
while (left < right)
{
int mid = (left + right) / 2;
char *ptr;
int32 cmpresult;
ptr = thresholds_data + mid * typlen;
locfcinfo.arg[0] = operand;
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
locfcinfo.argnull[0] = false;
locfcinfo.argnull[1] = false;
locfcinfo.isnull = false;
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
if (cmpresult < 0)
right = mid;
else
left = mid + 1;
}
return left;
}
/*
* width_bucket_array for generic variable-width data types.
*/
static int
width_bucket_array_variable(Datum operand,
ArrayType *thresholds,
Oid collation,
TypeCacheEntry *typentry)
{
char *thresholds_data;
int typlen = typentry->typlen;
bool typbyval = typentry->typbyval;
char typalign = typentry->typalign;
FunctionCallInfoData locfcinfo;
int left;
int right;
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
collation, NULL, NULL);
/* Find the bucket */
left = 0;
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
while (left < right)
{
int mid = (left + right) / 2;
char *ptr;
int i;
int32 cmpresult;
/* Locate mid'th array element by advancing from left element */
ptr = thresholds_data;
for (i = left; i < mid; i++)
{
ptr = att_addlength_pointer(ptr, typlen, ptr);
ptr = (char *) att_align_nominal(ptr, typalign);
}
locfcinfo.arg[0] = operand;
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
locfcinfo.argnull[0] = false;
locfcinfo.argnull[1] = false;
locfcinfo.isnull = false;
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
if (cmpresult < 0)
right = mid;
else
{
left = mid + 1;
/*
* Move the thresholds pointer to match new "left" index, so we
* don't have to seek over those elements again. This trick
* ensures we do only O(N) array indexing work, not O(N^2).
*/
ptr = att_addlength_pointer(ptr, typlen, ptr);
thresholds_data = (char *) att_align_nominal(ptr, typalign);
}
}
return left;
}

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201408281
#define CATALOG_VERSION_NO 201409091
#endif

View File

@ -514,7 +514,7 @@ DATA(insert OID = 308 ( float84le PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0
DATA(insert OID = 309 ( float84gt PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84gt _null_ _null_ _null_ ));
DATA(insert OID = 310 ( float84ge PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84ge _null_ _null_ _null_ ));
DATA(insert OID = 320 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "701 701 701 23" _null_ _null_ _null_ _null_ width_bucket_float8 _null_ _null_ _null_ ));
DESCR("bucket number of operand in equidepth histogram");
DESCR("bucket number of operand in equal-width histogram");
DATA(insert OID = 311 ( float8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 701 "700" _null_ _null_ _null_ _null_ ftod _null_ _null_ _null_ ));
DESCR("convert float4 to float8");
@ -885,6 +885,8 @@ DATA(insert OID = 2334 ( array_agg_finalfn PGNSP PGUID 12 1 0 0 0 f f f f f f
DESCR("aggregate final function");
DATA(insert OID = 2335 ( array_agg PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
DESCR("concatenate aggregate input into an array");
DATA(insert OID = 3218 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ ));
DESCR("bucket number of operand given a sorted array of bucket lower bounds");
DATA(insert OID = 3816 ( array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ ));
DESCR("array typanalyze");
DATA(insert OID = 3817 ( arraycontsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ ));
@ -2301,7 +2303,7 @@ DESCR("trunc(x/y)");
DATA(insert OID = 1980 ( numeric_div_trunc PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1700 "1700 1700" _null_ _null_ _null_ _null_ numeric_div_trunc _null_ _null_ _null_ ));
DESCR("trunc(x/y)");
DATA(insert OID = 2170 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "1700 1700 1700 23" _null_ _null_ _null_ _null_ width_bucket_numeric _null_ _null_ _null_ ));
DESCR("bucket number of operand in equidepth histogram");
DESCR("bucket number of operand in equal-width histogram");
DATA(insert OID = 1747 ( time_pl_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_pl_interval _null_ _null_ _null_ ));
DATA(insert OID = 1748 ( time_mi_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_mi_interval _null_ _null_ _null_ ));

View File

@ -214,6 +214,7 @@ extern Datum array_fill_with_lower_bounds(PG_FUNCTION_ARGS);
extern Datum array_unnest(PG_FUNCTION_ARGS);
extern Datum array_remove(PG_FUNCTION_ARGS);
extern Datum array_replace(PG_FUNCTION_ARGS);
extern Datum width_bucket_array(PG_FUNCTION_ARGS);
extern Datum array_ref(ArrayType *array, int nSubscripts, int *indx,
int arraytyplen, int elmlen, bool elmbyval, char elmalign,

View File

@ -1706,3 +1706,126 @@ select length(md5((f1[1]).c2)) from dest;
drop table dest;
drop type textandtext;
-- Tests for polymorphic-array form of width_bucket()
-- this exercises the varwidth and float8 code paths
SELECT
op,
width_bucket(op::numeric, ARRAY[1, 3, 5, 10.0]::numeric[]) AS wb_n1,
width_bucket(op::numeric, ARRAY[0, 5.5, 9.99]::numeric[]) AS wb_n2,
width_bucket(op::numeric, ARRAY[-6, -5, 2.0]::numeric[]) AS wb_n3,
width_bucket(op::float8, ARRAY[1, 3, 5, 10.0]::float8[]) AS wb_f1,
width_bucket(op::float8, ARRAY[0, 5.5, 9.99]::float8[]) AS wb_f2,
width_bucket(op::float8, ARRAY[-6, -5, 2.0]::float8[]) AS wb_f3
FROM (VALUES
(-5.2),
(-0.0000000001),
(0.000000000001),
(1),
(1.99999999999999),
(2),
(2.00000000000001),
(3),
(4),
(4.5),
(5),
(5.5),
(6),
(7),
(8),
(9),
(9.99999999999999),
(10),
(10.0000000000001)
) v(op);
op | wb_n1 | wb_n2 | wb_n3 | wb_f1 | wb_f2 | wb_f3
------------------+-------+-------+-------+-------+-------+-------
-5.2 | 0 | 0 | 1 | 0 | 0 | 1
-0.0000000001 | 0 | 0 | 2 | 0 | 0 | 2
0.000000000001 | 0 | 1 | 2 | 0 | 1 | 2
1 | 1 | 1 | 2 | 1 | 1 | 2
1.99999999999999 | 1 | 1 | 2 | 1 | 1 | 2
2 | 1 | 1 | 3 | 1 | 1 | 3
2.00000000000001 | 1 | 1 | 3 | 1 | 1 | 3
3 | 2 | 1 | 3 | 2 | 1 | 3
4 | 2 | 1 | 3 | 2 | 1 | 3
4.5 | 2 | 1 | 3 | 2 | 1 | 3
5 | 3 | 1 | 3 | 3 | 1 | 3
5.5 | 3 | 2 | 3 | 3 | 2 | 3
6 | 3 | 2 | 3 | 3 | 2 | 3
7 | 3 | 2 | 3 | 3 | 2 | 3
8 | 3 | 2 | 3 | 3 | 2 | 3
9 | 3 | 2 | 3 | 3 | 2 | 3
9.99999999999999 | 3 | 3 | 3 | 3 | 3 | 3
10 | 4 | 3 | 3 | 4 | 3 | 3
10.0000000000001 | 4 | 3 | 3 | 4 | 3 | 3
(19 rows)
-- ensure float8 path handles NaN properly
SELECT
op,
width_bucket(op, ARRAY[1, 3, 9, 'NaN', 'NaN']::float8[]) AS wb
FROM (VALUES
(-5.2::float8),
(4::float8),
(77::float8),
('NaN'::float8)
) v(op);
op | wb
------+----
-5.2 | 0
4 | 2
77 | 3
NaN | 5
(4 rows)
-- these exercise the generic fixed-width code path
SELECT
op,
width_bucket(op, ARRAY[1, 3, 5, 10]) AS wb_1
FROM generate_series(0,11) as op;
op | wb_1
----+------
0 | 0
1 | 1
2 | 1
3 | 2
4 | 2
5 | 3
6 | 3
7 | 3
8 | 3
9 | 3
10 | 4
11 | 4
(12 rows)
SELECT width_bucket(now(),
array['yesterday', 'today', 'tomorrow']::timestamptz[]);
width_bucket
--------------
2
(1 row)
-- corner cases
SELECT width_bucket(5, ARRAY[3]);
width_bucket
--------------
1
(1 row)
SELECT width_bucket(5, '{}');
width_bucket
--------------
0
(1 row)
-- error cases
SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]);
ERROR: function width_bucket(text, integer[]) does not exist
LINE 1: SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]);
^
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
SELECT width_bucket(5, ARRAY[3, 4, NULL]);
ERROR: thresholds array must not contain NULLs
SELECT width_bucket(5, ARRAY[ARRAY[1, 2], ARRAY[3, 4]]);
ERROR: thresholds must be one-dimensional array

View File

@ -476,3 +476,65 @@ drop table src;
select length(md5((f1[1]).c2)) from dest;
drop table dest;
drop type textandtext;
-- Tests for polymorphic-array form of width_bucket()
-- this exercises the varwidth and float8 code paths
SELECT
op,
width_bucket(op::numeric, ARRAY[1, 3, 5, 10.0]::numeric[]) AS wb_n1,
width_bucket(op::numeric, ARRAY[0, 5.5, 9.99]::numeric[]) AS wb_n2,
width_bucket(op::numeric, ARRAY[-6, -5, 2.0]::numeric[]) AS wb_n3,
width_bucket(op::float8, ARRAY[1, 3, 5, 10.0]::float8[]) AS wb_f1,
width_bucket(op::float8, ARRAY[0, 5.5, 9.99]::float8[]) AS wb_f2,
width_bucket(op::float8, ARRAY[-6, -5, 2.0]::float8[]) AS wb_f3
FROM (VALUES
(-5.2),
(-0.0000000001),
(0.000000000001),
(1),
(1.99999999999999),
(2),
(2.00000000000001),
(3),
(4),
(4.5),
(5),
(5.5),
(6),
(7),
(8),
(9),
(9.99999999999999),
(10),
(10.0000000000001)
) v(op);
-- ensure float8 path handles NaN properly
SELECT
op,
width_bucket(op, ARRAY[1, 3, 9, 'NaN', 'NaN']::float8[]) AS wb
FROM (VALUES
(-5.2::float8),
(4::float8),
(77::float8),
('NaN'::float8)
) v(op);
-- these exercise the generic fixed-width code path
SELECT
op,
width_bucket(op, ARRAY[1, 3, 5, 10]) AS wb_1
FROM generate_series(0,11) as op;
SELECT width_bucket(now(),
array['yesterday', 'today', 'tomorrow']::timestamptz[]);
-- corner cases
SELECT width_bucket(5, ARRAY[3]);
SELECT width_bucket(5, '{}');
-- error cases
SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]);
SELECT width_bucket(5, ARRAY[3, 4, NULL]);
SELECT width_bucket(5, ARRAY[ARRAY[1, 2], ARRAY[3, 4]]);