]> granicus.if.org Git - postgresql/commitdiff
Add selectivity estimation functions for intarray operators.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 21 Jul 2015 17:54:18 +0000 (20:54 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Tue, 21 Jul 2015 17:59:24 +0000 (20:59 +0300)
Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup
by me.

contrib/intarray/Makefile
contrib/intarray/_int_selfuncs.c [new file with mode: 0644]
contrib/intarray/expected/_int.out
contrib/intarray/intarray--1.0--1.1.sql [new file with mode: 0644]
contrib/intarray/intarray--1.1.sql [moved from contrib/intarray/intarray--1.0.sql with 88% similarity]
contrib/intarray/intarray.control
contrib/intarray/sql/_int.sql

index 920c5b1ba032217db405b5dba5a7128db43cd689..5ea7f2aedf8e57b7dcaa46aa7d7f90fd997ac5ac 100644 (file)
@@ -2,10 +2,10 @@
 
 MODULE_big = _int
 OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \
-       _intbig_gist.o _int_gin.o $(WIN32RES)
+       _intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES)
 
 EXTENSION = intarray
-DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql
+DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql
 PGFILEDESC = "intarray - functions and operators for arrays of integers"
 
 REGRESS = _int
diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c
new file mode 100644 (file)
index 0000000..2af1c9b
--- /dev/null
@@ -0,0 +1,341 @@
+/*-------------------------------------------------------------------------
+ *
+ * _int_selfuncs.c
+ *       Functions for selectivity estimation of intarray operators
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       contrib/intarray/_int_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "_int.h"
+
+#include "access/htup_details.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_type.h"
+#include "utils/selfuncs.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "miscadmin.h"
+
+PG_FUNCTION_INFO_V1(_int_overlap_sel);
+PG_FUNCTION_INFO_V1(_int_contains_sel);
+PG_FUNCTION_INFO_V1(_int_contained_sel);
+PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
+PG_FUNCTION_INFO_V1(_int_contains_joinsel);
+PG_FUNCTION_INFO_V1(_int_contained_joinsel);
+PG_FUNCTION_INFO_V1(_int_matchsel);
+
+Datum          _int_overlap_sel(PG_FUNCTION_ARGS);
+Datum          _int_contains_sel(PG_FUNCTION_ARGS);
+Datum          _int_contained_sel(PG_FUNCTION_ARGS);
+Datum          _int_overlap_joinsel(PG_FUNCTION_ARGS);
+Datum          _int_contains_joinsel(PG_FUNCTION_ARGS);
+Datum          _int_contained_joinsel(PG_FUNCTION_ARGS);
+Datum          _int_matchsel(PG_FUNCTION_ARGS);
+
+
+/*
+ * Forward declarations of the file-local helpers.  Parameter names match
+ * the definitions further down in this file.
+ */
+static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems,
+                                       float4 *mcefreqs, int nmcelems,
+                                       float4 minfreq);
+static int     compare_val_int4(const void *a, const void *b);
+
+/*
+ * Wrappers around the default array selectivity estimation functions.
+ *
+ * The default array selectivity estimators for the @>, && and <@ operators
+ * work fine for integer arrays. However, if we tried to just use arraycontsel
+ * and arraycontjoinsel directly as the cost estimator functions for our
+ * operators, they would not work as intended, because they look at the
+ * operator's OID. Our operators behave exactly like the built-in anyarray
+ * versions, but we must tell the cost estimator functions which built-in
+ * operators they correspond to. These wrappers just replace the operator
+ * OID with the corresponding built-in operator's OID, and call the built-in
+ * function.
+ */
+
+/*
+ * _int_overlap_sel -- restriction selectivity wrapper for array overlap.
+ *
+ * Hands the caller's arguments 0, 2 and 3 to arraycontsel unchanged, with
+ * OID_ARRAY_OVERLAP_OP placed in the operator-OID position, and returns
+ * whatever Datum arraycontsel produces.
+ */
+Datum
+_int_overlap_sel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall4(arraycontsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+/*
+ * _int_contains_sel -- restriction selectivity wrapper for array containment.
+ *
+ * Hands the caller's arguments 0, 2 and 3 to arraycontsel unchanged, with
+ * OID_ARRAY_CONTAINS_OP placed in the operator-OID position, and returns
+ * whatever Datum arraycontsel produces.
+ */
+Datum
+_int_contains_sel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall4(arraycontsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+/*
+ * _int_contained_sel -- restriction selectivity wrapper for "is contained by".
+ *
+ * Hands the caller's arguments 0, 2 and 3 to arraycontsel unchanged, with
+ * OID_ARRAY_CONTAINED_OP placed in the operator-OID position, and returns
+ * whatever Datum arraycontsel produces.
+ */
+Datum
+_int_contained_sel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall4(arraycontsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+/*
+ * _int_overlap_joinsel -- join selectivity wrapper for array overlap.
+ *
+ * Hands the caller's arguments 0, 2, 3 and 4 to arraycontjoinsel unchanged,
+ * with OID_ARRAY_OVERLAP_OP placed in the operator-OID position, and returns
+ * whatever Datum arraycontjoinsel produces.
+ */
+Datum
+_int_overlap_joinsel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall5(arraycontjoinsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3),
+                                      PG_GETARG_DATUM(4));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+/*
+ * _int_contains_joinsel -- join selectivity wrapper for array containment.
+ *
+ * Hands the caller's arguments 0, 2, 3 and 4 to arraycontjoinsel unchanged,
+ * with OID_ARRAY_CONTAINS_OP placed in the operator-OID position, and
+ * returns whatever Datum arraycontjoinsel produces.
+ */
+Datum
+_int_contains_joinsel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall5(arraycontjoinsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3),
+                                      PG_GETARG_DATUM(4));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+/*
+ * _int_contained_joinsel -- join selectivity wrapper for "is contained by".
+ *
+ * Hands the caller's arguments 0, 2, 3 and 4 to arraycontjoinsel unchanged,
+ * with OID_ARRAY_CONTAINED_OP placed in the operator-OID position, and
+ * returns whatever Datum arraycontjoinsel produces.
+ */
+Datum
+_int_contained_joinsel(PG_FUNCTION_ARGS)
+{
+       Datum           builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
+       Datum           estimate;
+
+       estimate = DirectFunctionCall5(arraycontjoinsel,
+                                      PG_GETARG_DATUM(0),
+                                      builtin_opr,
+                                      PG_GETARG_DATUM(2),
+                                      PG_GETARG_DATUM(3),
+                                      PG_GETARG_DATUM(4));
+
+       PG_RETURN_DATUM(estimate);
+}
+
+
+/*
+ * _int_matchsel -- restriction selectivity function for intarray @@ query_int
+ *
+ * Arguments read here: arg 0 is the PlannerInfo, arg 2 the operator's
+ * argument list and arg 3 the varRelid.  Arg 1 is not consulted.
+ *
+ * Returns a float8 selectivity:
+ *   - DEFAULT_EQ_SEL when the clause is not "variable @@ constant" (in
+ *     either order) with an int[] variable;
+ *   - 0.0 when the constant is NULL or is an empty query;
+ *   - otherwise the estimate computed by int_query_opr_selec() from the
+ *     column's most-common-elements statistics (if any), scaled by the
+ *     non-null fraction and clamped to a probability.
+ *
+ * Every exit taken after get_restriction_variable() has succeeded releases
+ * vardata, and every exit goes through PG_RETURN_FLOAT8 so that the result
+ * is always built as a float8 Datum rather than a raw double converted to
+ * Datum.
+ */
+Datum
+_int_matchsel(PG_FUNCTION_ARGS)
+{
+       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+
+       List       *args = (List *) PG_GETARG_POINTER(2);
+       int                     varRelid = PG_GETARG_INT32(3);
+       VariableStatData vardata;
+       Node       *other;
+       bool            varonleft;
+       Selectivity selec;
+       QUERYTYPE  *query;
+       Datum      *mcelems = NULL;
+       float4     *mcefreqs = NULL;
+       int                     nmcelems = 0;
+       float4          minfreq = 0.0;
+       float4          nullfrac = 0.0;
+       Form_pg_statistic stats;
+       Datum      *values = NULL;
+       int                     nvalues = 0;
+       float4     *numbers = NULL;
+       int                     nnumbers = 0;
+
+       /*
+        * If expression is not "variable @@ something" or "something @@ variable"
+        * then punt and return a default estimate.
+        */
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
+               PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+
+       /*
+        * Variable should be int[]. We don't support cases where variable is
+        * query_int.  vardata has been filled in by now, so it must be released
+        * on this path just like on the later ones.
+        */
+       if (vardata.vartype != INT4ARRAYOID)
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+       }
+
+       /*
+        * Can't do anything useful if the something is not a constant, either.
+        */
+       if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+       }
+
+       /*
+        * The "@@" operator is strict, so we can cope with NULL right away.
+        */
+       if (((Const *) other)->constisnull)
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(0.0);
+       }
+
+       /* The caller made sure the const is a query, so get it now */
+       query = DatumGetQueryTypeP(((Const *) other)->constvalue);
+
+       /* Empty query matches nothing */
+       if (query->size == 0)
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(0.0);
+       }
+
+       /*
+        * Get the statistics for the intarray column.
+        *
+        * We're interested in the Most-Common-Elements list, and the NULL
+        * fraction.
+        */
+       if (HeapTupleIsValid(vardata.statsTuple))
+       {
+               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+               nullfrac = stats->stanullfrac;
+
+               /*
+                * For an int4 array, the default array type analyze function will
+                * collect a Most Common Elements list, which is an array of int4s.
+                */
+               if (get_attstatsslot(vardata.statsTuple,
+                                                        INT4OID, -1,
+                                                        STATISTIC_KIND_MCELEM, InvalidOid,
+                                                        NULL,
+                                                        &values, &nvalues,
+                                                        &numbers, &nnumbers))
+               {
+                       /*
+                        * There should be three more Numbers than Values, because the
+                        * last three (for intarray) cells are taken for minimal, maximal
+                        * and nulls frequency. Punt if not.
+                        */
+                       if (nnumbers == nvalues + 3)
+                       {
+                               /*
+                                * Grab the lowest frequency: it is the first of the three
+                                * trailing cells, i.e. the one right after the per-element
+                                * frequencies.
+                                */
+                               minfreq = numbers[nvalues];
+
+                               mcelems = values;
+                               mcefreqs = numbers;
+                               nmcelems = nvalues;
+                       }
+               }
+       }
+
+       /*
+        * Process the logical expression in the query, using the stats.  The
+        * walk starts at the last item of the query's item array.
+        */
+       selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
+                                                               mcelems, mcefreqs, nmcelems, minfreq);
+
+       /* MCE stats count only non-null rows, so adjust for null rows. */
+       selec *= (1.0 - nullfrac);
+
+       /* values/numbers are still NULL here if no MCELEM slot was loaded. */
+       free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers);
+       ReleaseVariableStats(vardata);
+
+       CLAMP_PROBABILITY(selec);
+
+       PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * Estimate the selectivity of the query sub-expression rooted at "item".
+ *
+ * item      -- query node to evaluate (a VAL leaf or an OPR node)
+ * mcelems   -- most-common-element values, or NULL when no stats are usable
+ * mcefreqs  -- frequencies parallel to mcelems
+ * nmcelems  -- number of entries in mcelems
+ * minfreq   -- lowest frequency recorded in the stats
+ *
+ * A VAL leaf yields its recorded frequency when found in mcelems, a capped
+ * default otherwise, or plain DEFAULT_EQ_SEL when mcelems is NULL.  An OPR
+ * node combines the estimates of its operand(s) according to '!', '&' or
+ * '|'.  Any other node type or operator raises an error.
+ */
+static Selectivity
+int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
+                                       int nmcelems, float4 minfreq)
+{
+       Selectivity result;
+
+       /* recursion depth follows the query's nesting, so guard the stack */
+       check_stack_depth();
+
+       if (item->type == VAL)
+       {
+               Datum      *hit;
+
+               /* Without an MCELEM list there is nothing to look the value up in */
+               if (mcelems == NULL)
+                       return (Selectivity) DEFAULT_EQ_SEL;
+
+               hit = (Datum *) bsearch(&item->val, mcelems, nmcelems,
+                                                               sizeof(Datum), compare_val_int4);
+
+               /*
+                * Found: use the frequency ANALYZE recorded for this element.  Not
+                * found: punt, but assume the selectivity cannot exceed minfreq / 2.
+                */
+               result = hit ? mcefreqs[hit - mcelems]
+                       : Min(DEFAULT_EQ_SEL, minfreq / 2);
+       }
+       else if (item->type == OPR)
+       {
+               Selectivity near_sel;
+               Selectivity far_sel;
+
+               /* The operand stored immediately before the operator node */
+               near_sel = int_query_opr_selec(item - 1, mcelems, mcefreqs,
+                                                                          nmcelems, minfreq);
+
+               switch (item->val)
+               {
+                       case (int32) '!':
+                               result = 1.0 - near_sel;
+                               break;
+
+                       case (int32) '&':
+                               /* the other operand sits at offset item->left */
+                               far_sel = int_query_opr_selec(item + item->left, mcelems,
+                                                                                         mcefreqs, nmcelems, minfreq);
+                               result = near_sel * far_sel;
+                               break;
+
+                       case (int32) '|':
+                               /* the other operand sits at offset item->left */
+                               far_sel = int_query_opr_selec(item + item->left, mcelems,
+                                                                                         mcefreqs, nmcelems, minfreq);
+                               result = near_sel + far_sel - near_sel * far_sel;
+                               break;
+
+                       default:
+                               elog(ERROR, "unrecognized operator: %d", item->val);
+                               result = 0;             /* keep compiler quiet */
+                               break;
+               }
+       }
+       else
+       {
+               elog(ERROR, "unrecognized int query item type: %u", item->type);
+               result = 0;                             /* keep compiler quiet */
+       }
+
+       /* Keep intermediate values inside [0, 1] despite roundoff */
+       CLAMP_PROBABILITY(result);
+
+       return result;
+}
+
+/*
+ * Comparison function for binary search in mcelem array.
+ *
+ * "a" points to the int32 search key, "b" to a Datum element of the array
+ * being searched.  Returns a negative, zero or positive value as the key
+ * is less than, equal to or greater than the element.
+ *
+ * The result is produced by explicit comparisons rather than by
+ * subtracting the operands: the difference of two int32 values does not
+ * always fit in an int, and an overflowed difference would report the
+ * wrong ordering to bsearch().
+ */
+static int
+compare_val_int4(const void *a, const void *b)
+{
+       int32           key = *(const int32 *) a;
+       int32           elem = DatumGetInt32(*(const Datum *) b);
+
+       if (key < elem)
+               return -1;
+       if (key > elem)
+               return 1;
+       return 0;
+}
index 4080b9633fe98a91861684e0d82f1297c21b91af..962e5c6a4b1ddae2cf7eaa4bf46f41877c45dcde 100644 (file)
@@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
 
 CREATE TABLE test__int( a int[] );
 \copy test__int from 'data/test__int.data'
+ANALYZE test__int;
 SELECT count(*) from test__int WHERE a && '{23,50}';
  count 
 -------
diff --git a/contrib/intarray/intarray--1.0--1.1.sql b/contrib/intarray/intarray--1.0--1.1.sql
new file mode 100644 (file)
index 0000000..fecebdd
--- /dev/null
@@ -0,0 +1,49 @@
+/* contrib/intarray/intarray--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "ALTER EXTENSION intarray UPDATE TO '1.1'" to load this file. \quit
+
+-- Restriction estimator for matching an int[] against a query_int; attached
+-- to both argument orders of the match operator.
+CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+ALTER OPERATOR @@ (_int4, query_int)
+    SET (RESTRICT = _int_matchsel);
+ALTER OPERATOR ~~ (query_int, _int4)
+    SET (RESTRICT = _int_matchsel);
+
+-- Restriction estimators (four arguments) for the int[] overlap,
+-- contains and contained-by operators.
+CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+-- Join estimators (five arguments) for the same three operator families.
+CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
+    RETURNS float8
+    LANGUAGE C
+    STABLE STRICT
+    AS 'MODULE_PATHNAME';
+
+-- Point the int[] operators at the new estimators.
+ALTER OPERATOR && (_int4, _int4)
+    SET (RESTRICT = _int_overlap_sel, JOIN = _int_overlap_joinsel);
+ALTER OPERATOR @> (_int4, _int4)
+    SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
+ALTER OPERATOR <@ (_int4, _int4)
+    SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
+
+-- @ and ~ reuse the contains / contained-by estimators.
+ALTER OPERATOR @ (_int4, _int4)
+    SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
+ALTER OPERATOR ~ (_int4, _int4)
+    SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
similarity index 88%
rename from contrib/intarray/intarray--1.0.sql
rename to contrib/intarray/intarray--1.1.sql
index 0b89e0f55e52ac74a41239f70bc4be9ca1ad6a09..817625e54a0fb9f51ef62881f63a75e04f97f799 100644 (file)
@@ -1,4 +1,4 @@
-/* contrib/intarray/intarray--1.0.sql */
+/* contrib/intarray/intarray--1.1.sql */
 
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION intarray" to load this file. \quit
@@ -45,12 +45,17 @@ LANGUAGE C STRICT IMMUTABLE;
 
 COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array';
 
+CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
 CREATE OPERATOR @@ (
        LEFTARG = _int4,
        RIGHTARG = query_int,
        PROCEDURE = boolop,
        COMMUTATOR = '~~',
-       RESTRICT = contsel,
+       RESTRICT = _int_matchsel,
        JOIN = contjoinsel
 );
 
@@ -59,7 +64,7 @@ CREATE OPERATOR ~~ (
        RIGHTARG = _int4,
        PROCEDURE = rboolop,
        COMMUTATOR = '@@',
-       RESTRICT = contsel,
+       RESTRICT = _int_matchsel,
        JOIN = contjoinsel
 );
 
@@ -117,6 +122,36 @@ RETURNS _int4
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT IMMUTABLE;
 
+CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
+CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
+RETURNS float8
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE;
+
 --
 -- OPERATORS
 --
@@ -126,8 +161,8 @@ CREATE OPERATOR && (
        RIGHTARG = _int4,
        PROCEDURE = _int_overlap,
        COMMUTATOR = '&&',
-       RESTRICT = contsel,
-       JOIN = contjoinsel
+       RESTRICT = _int_overlap_sel,
+       JOIN = _int_overlap_joinsel
 );
 
 --CREATE OPERATOR = (
@@ -157,8 +192,8 @@ CREATE OPERATOR @> (
        RIGHTARG = _int4,
        PROCEDURE = _int_contains,
        COMMUTATOR = '<@',
-       RESTRICT = contsel,
-       JOIN = contjoinsel
+       RESTRICT = _int_contains_sel,
+       JOIN = _int_contains_joinsel
 );
 
 CREATE OPERATOR <@ (
@@ -166,8 +201,8 @@ CREATE OPERATOR <@ (
        RIGHTARG = _int4,
        PROCEDURE = _int_contained,
        COMMUTATOR = '@>',
-       RESTRICT = contsel,
-       JOIN = contjoinsel
+       RESTRICT = _int_contained_sel,
+       JOIN = _int_contained_joinsel
 );
 
 -- obsolete:
@@ -176,8 +211,8 @@ CREATE OPERATOR @ (
        RIGHTARG = _int4,
        PROCEDURE = _int_contains,
        COMMUTATOR = '~',
-       RESTRICT = contsel,
-       JOIN = contjoinsel
+       RESTRICT = _int_contains_sel,
+       JOIN = _int_contains_joinsel
 );
 
 CREATE OPERATOR ~ (
@@ -185,8 +220,8 @@ CREATE OPERATOR ~ (
        RIGHTARG = _int4,
        PROCEDURE = _int_contained,
        COMMUTATOR = '@',
-       RESTRICT = contsel,
-       JOIN = contjoinsel
+       RESTRICT = _int_contained_sel,
+       JOIN = _int_contained_joinsel
 );
 
 --------------
index 7b3d4f78f053adc3c046f77d45f1bdb78cd54498..8c23e8d5e2ed799c6bf9831f6bb5f3fe07d05875 100644 (file)
@@ -1,5 +1,5 @@
 # intarray extension
 comment = 'functions, operators, and index support for 1-D arrays of integers'
-default_version = '1.0'
+default_version = '1.1'
 module_pathname = '$libdir/_int'
 relocatable = true
index 216c5c58d615a7cd1a5fe3714c9a5c91fb255138..f6fe2de55c587b7d0f3f89fb2af764f1deab291d 100644 (file)
@@ -68,8 +68,8 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
 
 
 CREATE TABLE test__int( a int[] );
-
 \copy test__int from 'data/test__int.data'
+ANALYZE test__int;
 
 SELECT count(*) from test__int WHERE a && '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23|50';