From e80252d424278abf65b624669c8e6b3fe8587cac Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 9 Sep 2014 15:34:10 -0400 Subject: [PATCH] Add width_bucket(anyelement, anyarray). This provides a convenient method of classifying input values into buckets that are not necessarily equal-width. It works on any sortable data type. The choice of function name is a bit debatable, perhaps, but showing that there's a relationship to the SQL standard's width_bucket() function seems more attractive than the other proposals. Petr Jelinek, reviewed by Pavel Stehule --- doc/src/sgml/func.sgml | 33 +++- src/backend/utils/adt/arrayfuncs.c | 243 +++++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.h | 6 +- src/include/utils/array.h | 1 + src/test/regress/expected/arrays.out | 123 ++++++++++++++ src/test/regress/sql/arrays.sql | 62 +++++++ 7 files changed, 458 insertions(+), 12 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 84e58944ce..3a7cfa93c2 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -901,25 +901,40 @@ width_bucket - width_bucket(op numeric, b1 numeric, b2 numeric, count int) - + width_bucket(operand dp, b1 dp, b2 dp, count int) int - return the bucket to which operand would - be assigned in an equidepth histogram with count - buckets, in the range b1 to b2 + return the bucket number to which operand would + be assigned in a histogram having count equal-width + buckets spanning the range b1 to b2; + returns 0 or count+1 for + an input outside the range width_bucket(5.35, 0.024, 10.06, 5) 3 - width_bucket(op dp, b1 dp, b2 dp, count int) + width_bucket(operand numeric, b1 numeric, b2 numeric, count int) int - return the bucket to which operand would - be assigned in an equidepth histogram with count - buckets, in the range b1 to b2 + return the bucket number to which operand would + be assigned in a histogram having count equal-width + buckets spanning the range b1 to b2; + returns 0 or count+1 for + an input outside the range width_bucket(5.35, 0.024, 10.06, 5) 3 + + + width_bucket(operand anyelement, thresholds anyarray) + int + return the bucket number to which operand would + be assigned given an array listing the lower bounds of the buckets; + returns 0 for an input less than the first lower bound; + the thresholds array must be sorted, + smallest first, or unexpected results will be obtained + width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[]) + 2 + diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index f8e94ec365..fafc852a4f 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -15,8 +15,10 @@ #include "postgres.h" #include +#include #include "access/htup_details.h" +#include "catalog/pg_type.h" #include "funcapi.h" #include "libpq/pqformat.h" #include "utils/array.h" @@ -130,6 +132,15 @@ static ArrayType *array_replace_internal(ArrayType *array, Datum replace, bool replace_isnull, bool remove, Oid collation, FunctionCallInfo fcinfo); +static int width_bucket_array_float8(Datum operand, ArrayType *thresholds); +static int width_bucket_array_fixed(Datum operand, + ArrayType *thresholds, + Oid collation, + TypeCacheEntry *typentry); +static int width_bucket_array_variable(Datum operand, + ArrayType *thresholds, + Oid collation, + TypeCacheEntry *typentry); /* @@ -5502,3 +5513,235 @@ array_replace(PG_FUNCTION_ARGS) fcinfo); PG_RETURN_ARRAYTYPE_P(array); } + +/* + * Implements width_bucket(anyelement, anyarray). + * + * 'thresholds' is an array containing lower bound values for each bucket; + * these must be sorted from smallest to largest, or bogus results will be + * produced. If N thresholds are supplied, the output is from 0 to N: + * 0 is for inputs < first threshold, N is for inputs >= last threshold. + */ +Datum +width_bucket_array(PG_FUNCTION_ARGS) +{ + Datum operand = PG_GETARG_DATUM(0); + ArrayType *thresholds = PG_GETARG_ARRAYTYPE_P(1); + Oid collation = PG_GET_COLLATION(); + Oid element_type = ARR_ELEMTYPE(thresholds); + int result; + + /* Check input */ + if (ARR_NDIM(thresholds) > 1) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("thresholds must be one-dimensional array"))); + + if (array_contains_nulls(thresholds)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("thresholds array must not contain NULLs"))); + + /* We have a dedicated implementation for float8 data */ + if (element_type == FLOAT8OID) + result = width_bucket_array_float8(operand, thresholds); + else + { + TypeCacheEntry *typentry; + + /* Cache information about the input type */ + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || + typentry->type_id != element_type) + { + typentry = lookup_type_cache(element_type, + TYPECACHE_CMP_PROC_FINFO); + if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(element_type)))); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + + /* + * We have separate implementation paths for fixed- and variable-width + * types, since indexing the array is a lot cheaper in the first case. + */ + if (typentry->typlen > 0) + result = width_bucket_array_fixed(operand, thresholds, + collation, typentry); + else + result = width_bucket_array_variable(operand, thresholds, + collation, typentry); + } + + /* Avoid leaking memory when handed toasted input. */ + PG_FREE_IF_COPY(thresholds, 1); + + PG_RETURN_INT32(result); +} + +/* + * width_bucket_array for float8 data. + */ +static int +width_bucket_array_float8(Datum operand, ArrayType *thresholds) +{ + float8 op = DatumGetFloat8(operand); + float8 *thresholds_data; + int left; + int right; + + /* + * Since we know the array contains no NULLs, we can just index it + * directly. + */ + thresholds_data = (float8 *) ARR_DATA_PTR(thresholds); + + left = 0; + right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds)); + + /* + * If the probe value is a NaN, it's greater than or equal to all possible + * threshold values (including other NaNs), so we need not search. Note + * that this would give the same result as searching even if the array + * contains multiple NaNs (as long as they're correctly sorted), since the + * loop logic will find the rightmost of multiple equal threshold values. + */ + if (isnan(op)) + return right; + + /* Find the bucket */ + while (left < right) + { + int mid = (left + right) / 2; + + if (isnan(thresholds_data[mid]) || op < thresholds_data[mid]) + right = mid; + else + left = mid + 1; + } + + return left; +} + +/* + * width_bucket_array for generic fixed-width data types. + */ +static int +width_bucket_array_fixed(Datum operand, + ArrayType *thresholds, + Oid collation, + TypeCacheEntry *typentry) +{ + char *thresholds_data; + int typlen = typentry->typlen; + bool typbyval = typentry->typbyval; + FunctionCallInfoData locfcinfo; + int left; + int right; + + /* + * Since we know the array contains no NULLs, we can just index it + * directly. + */ + thresholds_data = (char *) ARR_DATA_PTR(thresholds); + + InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2, + collation, NULL, NULL); + + /* Find the bucket */ + left = 0; + right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds)); + while (left < right) + { + int mid = (left + right) / 2; + char *ptr; + int32 cmpresult; + + ptr = thresholds_data + mid * typlen; + + locfcinfo.arg[0] = operand; + locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen); + locfcinfo.argnull[0] = false; + locfcinfo.argnull[1] = false; + locfcinfo.isnull = false; + + cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo)); + + if (cmpresult < 0) + right = mid; + else + left = mid + 1; + } + + return left; +} + +/* + * width_bucket_array for generic variable-width data types. + */ +static int +width_bucket_array_variable(Datum operand, + ArrayType *thresholds, + Oid collation, + TypeCacheEntry *typentry) +{ + char *thresholds_data; + int typlen = typentry->typlen; + bool typbyval = typentry->typbyval; + char typalign = typentry->typalign; + FunctionCallInfoData locfcinfo; + int left; + int right; + + thresholds_data = (char *) ARR_DATA_PTR(thresholds); + + InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2, + collation, NULL, NULL); + + /* Find the bucket */ + left = 0; + right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds)); + while (left < right) + { + int mid = (left + right) / 2; + char *ptr; + int i; + int32 cmpresult; + + /* Locate mid'th array element by advancing from left element */ + ptr = thresholds_data; + for (i = left; i < mid; i++) + { + ptr = att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_align_nominal(ptr, typalign); + } + + locfcinfo.arg[0] = operand; + locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen); + locfcinfo.argnull[0] = false; + locfcinfo.argnull[1] = false; + locfcinfo.isnull = false; + + cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo)); + + if (cmpresult < 0) + right = mid; + else + { + left = mid + 1; + + /* + * Move the thresholds pointer to match new "left" index, so we + * don't have to seek over those elements again. This trick + * ensures we do only O(N) array indexing work, not O(N^2). + */ + ptr = att_addlength_pointer(ptr, typlen, ptr); + thresholds_data = (char *) att_align_nominal(ptr, typalign); + } + } + + return left; +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index e1b62a505c..5dc0455477 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201408281 +#define CATALOG_VERSION_NO 201409091 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 5176ed04dd..bd1b41723c 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -514,7 +514,7 @@ DATA(insert OID = 308 ( float84le PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 DATA(insert OID = 309 ( float84gt PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84gt _null_ _null_ _null_ )); DATA(insert OID = 310 ( float84ge PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84ge _null_ _null_ _null_ )); DATA(insert OID = 320 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "701 701 701 23" _null_ _null_ _null_ _null_ width_bucket_float8 _null_ _null_ _null_ )); -DESCR("bucket number of operand in equidepth histogram"); +DESCR("bucket number of operand in equal-width histogram"); DATA(insert OID = 311 ( float8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 701 "700" _null_ _null_ _null_ _null_ ftod _null_ _null_ _null_ )); DESCR("convert float4 to float8"); @@ -885,6 +885,8 @@ DATA(insert OID = 2334 ( array_agg_finalfn PGNSP PGUID 12 1 0 0 0 f f f f f f DESCR("aggregate final function"); DATA(insert OID = 2335 ( array_agg PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ )); DESCR("concatenate aggregate input into an array"); +DATA(insert OID = 3218 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ )); +DESCR("bucket number of operand given a sorted array of bucket lower bounds"); DATA(insert OID = 3816 ( array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ )); DESCR("array typanalyze"); DATA(insert OID = 3817 ( arraycontsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ )); @@ -2301,7 +2303,7 @@ DESCR("trunc(x/y)"); DATA(insert OID = 1980 ( numeric_div_trunc PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1700 "1700 1700" _null_ _null_ _null_ _null_ numeric_div_trunc _null_ _null_ _null_ )); DESCR("trunc(x/y)"); DATA(insert OID = 2170 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "1700 1700 1700 23" _null_ _null_ _null_ _null_ width_bucket_numeric _null_ _null_ _null_ )); -DESCR("bucket number of operand in equidepth histogram"); +DESCR("bucket number of operand in equal-width histogram"); DATA(insert OID = 1747 ( time_pl_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_pl_interval _null_ _null_ _null_ )); DATA(insert OID = 1748 ( time_mi_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_mi_interval _null_ _null_ _null_ )); diff --git a/src/include/utils/array.h b/src/include/utils/array.h index 9bbfaae85e..e744314c51 100644 --- a/src/include/utils/array.h +++ b/src/include/utils/array.h @@ -214,6 +214,7 @@ extern Datum array_fill_with_lower_bounds(PG_FUNCTION_ARGS); extern Datum array_unnest(PG_FUNCTION_ARGS); extern Datum array_remove(PG_FUNCTION_ARGS); extern Datum array_replace(PG_FUNCTION_ARGS); +extern Datum width_bucket_array(PG_FUNCTION_ARGS); extern Datum array_ref(ArrayType *array, int nSubscripts, int *indx, int arraytyplen, int elmlen, bool elmbyval, char elmalign, diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 4286691f92..46eff67b42 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -1706,3 +1706,126 @@ select length(md5((f1[1]).c2)) from dest; drop table dest; drop type textandtext; +-- Tests for polymorphic-array form of width_bucket() +-- this exercises the varwidth and float8 code paths +SELECT + op, + width_bucket(op::numeric, ARRAY[1, 3, 5, 10.0]::numeric[]) AS wb_n1, + width_bucket(op::numeric, ARRAY[0, 5.5, 9.99]::numeric[]) AS wb_n2, + width_bucket(op::numeric, ARRAY[-6, -5, 2.0]::numeric[]) AS wb_n3, + width_bucket(op::float8, ARRAY[1, 3, 5, 10.0]::float8[]) AS wb_f1, + width_bucket(op::float8, ARRAY[0, 5.5, 9.99]::float8[]) AS wb_f2, + width_bucket(op::float8, ARRAY[-6, -5, 2.0]::float8[]) AS wb_f3 +FROM (VALUES + (-5.2), + (-0.0000000001), + (0.000000000001), + (1), + (1.99999999999999), + (2), + (2.00000000000001), + (3), + (4), + (4.5), + (5), + (5.5), + (6), + (7), + (8), + (9), + (9.99999999999999), + (10), + (10.0000000000001) +) v(op); + op | wb_n1 | wb_n2 | wb_n3 | wb_f1 | wb_f2 | wb_f3 +------------------+-------+-------+-------+-------+-------+------- + -5.2 | 0 | 0 | 1 | 0 | 0 | 1 + -0.0000000001 | 0 | 0 | 2 | 0 | 0 | 2 + 0.000000000001 | 0 | 1 | 2 | 0 | 1 | 2 + 1 | 1 | 1 | 2 | 1 | 1 | 2 + 1.99999999999999 | 1 | 1 | 2 | 1 | 1 | 2 + 2 | 1 | 1 | 3 | 1 | 1 | 3 + 2.00000000000001 | 1 | 1 | 3 | 1 | 1 | 3 + 3 | 2 | 1 | 3 | 2 | 1 | 3 + 4 | 2 | 1 | 3 | 2 | 1 | 3 + 4.5 | 2 | 1 | 3 | 2 | 1 | 3 + 5 | 3 | 1 | 3 | 3 | 1 | 3 + 5.5 | 3 | 2 | 3 | 3 | 2 | 3 + 6 | 3 | 2 | 3 | 3 | 2 | 3 + 7 | 3 | 2 | 3 | 3 | 2 | 3 + 8 | 3 | 2 | 3 | 3 | 2 | 3 + 9 | 3 | 2 | 3 | 3 | 2 | 3 + 9.99999999999999 | 3 | 3 | 3 | 3 | 3 | 3 + 10 | 4 | 3 | 3 | 4 | 3 | 3 + 10.0000000000001 | 4 | 3 | 3 | 4 | 3 | 3 +(19 rows) + +-- ensure float8 path handles NaN properly +SELECT + op, + width_bucket(op, ARRAY[1, 3, 9, 'NaN', 'NaN']::float8[]) AS wb +FROM (VALUES + (-5.2::float8), + (4::float8), + (77::float8), + ('NaN'::float8) +) v(op); + op | wb +------+---- + -5.2 | 0 + 4 | 2 + 77 | 3 + NaN | 5 +(4 rows) + +-- these exercise the generic fixed-width code path +SELECT + op, + width_bucket(op, ARRAY[1, 3, 5, 10]) AS wb_1 +FROM generate_series(0,11) as op; + op | wb_1 +----+------ + 0 | 0 + 1 | 1 + 2 | 1 + 3 | 2 + 4 | 2 + 5 | 3 + 6 | 3 + 7 | 3 + 8 | 3 + 9 | 3 + 10 | 4 + 11 | 4 +(12 rows) + +SELECT width_bucket(now(), + array['yesterday', 'today', 'tomorrow']::timestamptz[]); + width_bucket +-------------- + 2 +(1 row) + +-- corner cases +SELECT width_bucket(5, ARRAY[3]); + width_bucket +-------------- + 1 +(1 row) + +SELECT width_bucket(5, '{}'); + width_bucket +-------------- + 0 +(1 row) + +-- error cases +SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]); +ERROR: function width_bucket(text, integer[]) does not exist +LINE 1: SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]); + ^ +HINT: No function matches the given name and argument types. You might need to add explicit type casts. +SELECT width_bucket(5, ARRAY[3, 4, NULL]); +ERROR: thresholds array must not contain NULLs +SELECT width_bucket(5, ARRAY[ARRAY[1, 2], ARRAY[3, 4]]); +ERROR: thresholds must be one-dimensional array diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index d9f7cbfa8d..fa8a20ad1c 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -476,3 +476,65 @@ drop table src; select length(md5((f1[1]).c2)) from dest; drop table dest; drop type textandtext; + +-- Tests for polymorphic-array form of width_bucket() + +-- this exercises the varwidth and float8 code paths +SELECT + op, + width_bucket(op::numeric, ARRAY[1, 3, 5, 10.0]::numeric[]) AS wb_n1, + width_bucket(op::numeric, ARRAY[0, 5.5, 9.99]::numeric[]) AS wb_n2, + width_bucket(op::numeric, ARRAY[-6, -5, 2.0]::numeric[]) AS wb_n3, + width_bucket(op::float8, ARRAY[1, 3, 5, 10.0]::float8[]) AS wb_f1, + width_bucket(op::float8, ARRAY[0, 5.5, 9.99]::float8[]) AS wb_f2, + width_bucket(op::float8, ARRAY[-6, -5, 2.0]::float8[]) AS wb_f3 +FROM (VALUES + (-5.2), + (-0.0000000001), + (0.000000000001), + (1), + (1.99999999999999), + (2), + (2.00000000000001), + (3), + (4), + (4.5), + (5), + (5.5), + (6), + (7), + (8), + (9), + (9.99999999999999), + (10), + (10.0000000000001) +) v(op); + +-- ensure float8 path handles NaN properly +SELECT + op, + width_bucket(op, ARRAY[1, 3, 9, 'NaN', 'NaN']::float8[]) AS wb +FROM (VALUES + (-5.2::float8), + (4::float8), + (77::float8), + ('NaN'::float8) +) v(op); + +-- these exercise the generic fixed-width code path +SELECT + op, + width_bucket(op, ARRAY[1, 3, 5, 10]) AS wb_1 +FROM generate_series(0,11) as op; + +SELECT width_bucket(now(), + array['yesterday', 'today', 'tomorrow']::timestamptz[]); + +-- corner cases +SELECT width_bucket(5, ARRAY[3]); +SELECT width_bucket(5, '{}'); + +-- error cases +SELECT width_bucket('5'::text, ARRAY[3, 4]::integer[]); +SELECT width_bucket(5, ARRAY[3, 4, NULL]); +SELECT width_bucket(5, ARRAY[ARRAY[1, 2], ARRAY[3, 4]]); -- 2.40.0