]> granicus.if.org Git - postgis/commitdiff
added skeleton functions for pg75 stats integration
authorSandro Santilli <strk@keybit.net>
Mon, 23 Feb 2004 12:18:55 +0000 (12:18 +0000)
committerSandro Santilli <strk@keybit.net>
Mon, 23 Feb 2004 12:18:55 +0000 (12:18 +0000)
git-svn-id: http://svn.osgeo.org/postgis/trunk@448 b70326c6-7e19-0410-871a-916f4a2858ee

Attic/postgis_sql_75_end.sql.in
Attic/postgis_sql_75_start.sql.in
postgis.h
postgis_estimate.c

index cca969f925ff82f0d260a07540bcf4640d2bcbd2..989a65f7e8a4ec91f8cbebe65ac5ba3ecb80317e 100644 (file)
@@ -12,6 +12,9 @@
 --  
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 -- $Log$
+-- Revision 1.2  2004/02/23 12:18:55  strk
+-- added skeleton functions for pg75 stats integration
+--
 -- Revision 1.1  2004/02/04 22:27:02  strk
 -- Added pg75 support scripts
 --
@@ -157,11 +160,6 @@ CREATE FUNCTION estimate_histogram2d(histogram2d,box)
        AS '@MODULE_FILENAME@','estimate_histogram2d'
        LANGUAGE 'C'  with (isstrict);
 
-CREATE FUNCTION postgisgistcostestimate(internal,internal,internal,internal,internal,internal,internal,internal)
-       RETURNS opaque
-       AS '@MODULE_FILENAME@','postgisgistcostestimate'
-       LANGUAGE 'C'  with (isstrict);
-
 --
 -- 7.2 GiST support functions
 --
index 1d1c54827b42e65c71161e6c4d86fd9f4279558a..2f76df9d2dd5083ce9228aecff7857774e22aef0 100644 (file)
@@ -12,6 +12,9 @@
 --  
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 -- $Log$
+-- Revision 1.2  2004/02/23 12:18:55  strk
+-- added skeleton functions for pg75 stats integration
+--
 -- Revision 1.1  2004/02/04 22:27:02  strk
 -- Added pg75 support scripts
 --
@@ -36,11 +39,11 @@ BEGIN TRANSACTION;
 -- if it is, you get a message "PL/PgSQL is installed" 
 -- otherwise it will give a big error message.
 
-CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler AS
-    '$libdir/plpgsql' LANGUAGE C;
+--CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler AS
+--    '$libdir/plpgsql' LANGUAGE C;
 
-CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql
-    HANDLER plpgsql_call_handler;
+--CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql
+--    HANDLER plpgsql_call_handler;
 
 --(select 'PL/PgSQL is installed.' as message from pg_language where lanname='plpgsql') union (select 'You must install PL/PgSQL before running this SQL file,\nor you will get an error. To install PL/PgSQL run:\n\tcreatelang plpgsql <dbname>'::text as message) order by message limit 1;
 
@@ -158,12 +161,18 @@ CREATE FUNCTION geometry_out(geometry)
        AS '@MODULE_FILENAME@'
        LANGUAGE 'C' WITH (isstrict);
 
+CREATE FUNCTION geometry_analyze(internal)
+       RETURNS bool
+       AS '@MODULE_FILENAME@'
+       LANGUAGE 'C' WITH (isstrict);
+
 CREATE TYPE geometry (
        alignment = double,
        internallength = variable,
        input = geometry_in,
        output = geometry_out,
-       storage = main
+       storage = main,
+       analyze = geometry_analyze
 );
 
 --
index 001c292a4aaf075357bc85e2ad3d26ff02d7c80d..9d3080001b03abc9a7f86144b085bc2c9c4c81a2 100644 (file)
--- a/postgis.h
+++ b/postgis.h
@@ -11,6 +11,9 @@
  *
  **********************************************************************
  * $Log$
+ * Revision 1.42  2004/02/23 12:18:55  strk
+ * added skeleton functions for pg75 stats integration
+ *
  * Revision 1.41  2004/01/25 19:33:00  pramsey
  * Test commit on new CVS archive.
  *
@@ -618,6 +621,9 @@ Datum geometry2box(PG_FUNCTION_ARGS);
 
 Datum explode_histogram2d(PG_FUNCTION_ARGS);
 Datum estimate_histogram2d(PG_FUNCTION_ARGS);
+#if USE_VERSION >= 75
+Datum geometry_analyze(PG_FUNCTION_ARGS);
+#endif
 
 Datum postgisgistcostestimate(PG_FUNCTION_ARGS);
 
index e8d81b53955dc2f514e9ba1f8352ba20e865de0e..3398a5ef92f00508cd9c5047a6ffbec21a6f6ab2 100644 (file)
@@ -11,6 +11,9 @@
  * 
  **********************************************************************
  * $Log$
+ * Revision 1.8  2004/02/23 12:18:55  strk
+ * added skeleton functions for pg75 stats integration
+ *
  * Revision 1.7  2003/11/11 10:14:57  strk
  * Added support for PG74
  *
 
 #include "executor/spi.h"
 
+#if USE_VERSION >= 75
+
+#define DEBUG_GEOMETRY_STATS 1
+
+#include "commands/vacuum.h"
+
+/*
+ *     Assign a number to the postgis statistics kind
+ *
+ *     tgl suggested:
+ *
+ *     1-100:  reserved for assignment by the core Postgres project
+ *     100-199: reserved for assignment by PostGIS
+ *     200-9999: reserved for other globally-known stats kinds
+ *     10000-32767: reserved for private site-local use
+ *
+ */
+#define STATISTIC_KIND_GEOMETRY 100
+
+typedef struct GEOM_STATS_T
+{
+       //boxesPerSide * boxesPerSide = total boxes in grid
+       float4 boxesPerSide;
+       // average bbox area of features in this histogram
+       float4 avgFeatureArea;
+       // BOX of area
+       float4 xmin,ymin, xmax, ymax;
+       // variable length # of floats for histogram
+       float4 value[1];
+} GEOM_STATS;
+
+#endif
+
 
 //estimate_histogram2d(histogram2d, box)
 // returns a % estimate of the # of features that will be returned by that box query
@@ -575,6 +611,7 @@ get_restriction_var(List *args,
 
 
 
+#if USE_VERSION < 75
 //restriction in the GiST && operator
 PG_FUNCTION_INFO_V1(postgis_gist_sel);
 Datum postgis_gist_sel(PG_FUNCTION_ARGS)
@@ -865,4 +902,364 @@ postgisgistcostestimate(PG_FUNCTION_ARGS)
     PG_RETURN_VOID();
 }
 
+#else // USE_VERSION >= 75
+
+/*
+ * This function should return an estimation of the number of
+ * rows returned by a query involving an overlap check 
+ * ( it's the restrict function for the && operator )
+ *
+ * It can make use (if available) of the statistics collected
+ * by the geometry analyzer function.
+ *
+ */
+PG_FUNCTION_INFO_V1(postgis_gist_sel);
+Datum postgis_gist_sel(PG_FUNCTION_ARGS)
+{
+       Query *root = (Query *) PG_GETARG_POINTER(0);
+       //      Oid                     operator = PG_GETARG_OID(1);
+       List *args = (List *) PG_GETARG_POINTER(2);
+       int varRelid = PG_GETARG_INT32(3);
+       GEOMETRY *in;
+       BOX *search_box;
+       char sql[1000];
+       SPITupleTable *tuptable;
+       TupleDesc tupdesc ;
+       HeapTuple tuple ;
+       Datum datum;
+       bool isnull;
+       Var *var;
+       Node *other;
+       bool varonleft;
+       Oid relid;
+       int SPIcode;
+       double myest;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, "postgis_gist_sel called");
+       elog(NOTICE, " returning a fixed estimate of 0.000005%%");
+#endif
+       PG_RETURN_FLOAT8(0.000005);
+
+       if (!get_restriction_var(args, varRelid,
+               &var, &other, &varonleft))
+       {
+               //elog(NOTICE,"get_restriction_var FAILED -returning early");
+               PG_RETURN_FLOAT8(0.000005);
+       }
+
+       relid = getrelid(var->varno, root->rtable);
+       if (relid == InvalidOid)
+       {
+               //elog(NOTICE,"getrelid FAILED (invalid oid) -returning early");
+               PG_RETURN_FLOAT8(0.000005);
+       }
+
+       //elog(NOTICE,"operator's oid = %i (this should be GEOMETRY && GEOMETRY)",operator);
+       //elog(NOTICE,"relations' oid = %i (this should be the relation that the && is working on) ",relid);
+       //elog(NOTICE,"varatt oid = %i (basically relations column #) ",var->varattno);
+
+
+       if (IsA(other, Const) &&((Const *) other)->constisnull)
+       {
+               //elog(NOTICE,"other operand of && is NULL - returning early");
+               PG_RETURN_FLOAT8(0.000005);
+       }
+
+       if (IsA(other, Const))
+       {
+               //elog(NOTICE,"The other side of the && is a constant with type (oid) = %i and length %i.  This should be GEOMETRY with length -1 (variable length)",((Const*)other)->consttype,((Const*)other)->constlen);
+       }
+       else
+       {
+               //elog(NOTICE,"the other side of && isnt a constant - returning early");
+               PG_RETURN_FLOAT8(0.000005);
+       }
+
+       //get the BOX thats being searched in
+       in = (GEOMETRY*)PG_DETOAST_DATUM( ((Const*)other)->constvalue );
+       search_box = convert_box3d_to_box(&in->bvol);
+
+       //elog(NOTICE,"requested search box is : (%.15g %.15g, %.15g %.15g)",search_box->low.x,search_box->low.y,search_box->high.x,search_box->high.y);
+
+
+       SPIcode = SPI_connect();
+       if (SPIcode  != SPI_OK_CONNECT)
+       {
+               elog(NOTICE,"postgis_gist_sel: couldnt open a connection to SPI:%i",SPIcode);
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+
+       sprintf(sql,"SELECT stats FROM GEOMETRY_COLUMNS WHERE attrelid=%u AND varattnum=%i",relid,var->varattno);
+       //elog(NOTICE,"sql:%s",sql);
+       SPIcode = SPI_exec(sql, 1 );
+       if (SPIcode  != SPI_OK_SELECT )
+       {
+               SPI_finish();
+               elog(NOTICE,"postgis_gist_sel: couldnt execute sql via SPI");
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+
+       if (SPI_processed !=1)
+       {
+               SPI_finish();
+               //elog(NOTICE,"postgis_gist_sel: geometry_columns didnt return a unique value");
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+
+       tuptable = SPI_tuptable;
+       tupdesc = SPI_tuptable->tupdesc;
+       tuple = tuptable->vals[0];
+       datum = SPI_getbinval(tuple, tupdesc, 1, &isnull);
+       if (isnull)
+       {
+               SPI_finish();
+               //elog(NOTICE,"postgis_gist_sel: geometry_columns returned a null histogram");
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+
+       //elog(NOTICE,"postgis_gist_sel: checking against estimate_histogram2d");
+       // now we have the histogram, and our search box - use the estimate_histogram2d(histo,box) to get the result!
+       myest = DatumGetFloat8( DirectFunctionCall2( estimate_histogram2d, datum, PointerGetDatum(search_box) ) );
+
+       if ( (myest<0) || (myest!=myest) ) // <0?  or NaN?
+       {
+               //elog(NOTICE,"postgis_gist_sel: got something crazy back from estimate_histogram2d");
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+
+       SPIcode =SPI_finish();
+       if (SPIcode  != SPI_OK_FINISH )
+       {
+               //elog(NOTICE,"postgis_gist_sel: couldnt disconnect from SPI");
+               PG_RETURN_FLOAT8(0.000005) ;
+       }
+       //elog(NOTICE,"postgis_gist_sel: finished, returning with %lf",myest);
+       PG_RETURN_FLOAT8(myest);
+}
+
+/*
+ * This function is called by the analyze function iff
+ * the geometry_analyze() function give it its pointer
+ * (this is always the case so far).
+ * The geometry_analyze() function is also responsible
+ * of deciding the number of "sample" rows we will receive
+ * here. It is able to give use other 'custom' data, but we
+ * won't use them so far.
+ *
+ * Our job is to build some statistics on the sample data
+ * for use by operator estimators.
+ *
+ * Currently we only need statistics to estimate the number of rows
+ * overlapping a given extent (estimation function bound
+ * to the && operator).
+ *
+ */
+static void
+compute_geometry_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+               int samplerows, double totalrows)
+{
+       MemoryContext old_context;
+       int i;
+       int size;
+       BOX **sampleboxes;
+       GEOM_STATS *geomstats;
+       bool isnull;
+       int null_cnt=0, notnull_cnt=0;
+       BOX3D *sample_extent=NULL;
+       double total_width=0;
+       double total_boxes_area=0;
+       /*
+        * This is where geometry_analyze
+        * should put its' custom parameters.
+        * boxesPerSide would be a good candidate.
+        */
+       //void *mystats = stats->extra_data;
+       int boxesPerSide = 40; 
+
+
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, "compute_geometry_stats called");
+       elog(NOTICE, " samplerows: %d", samplerows);
+#endif
+
+       sampleboxes = palloc(sizeof(BOX *)*samplerows);
+
+       /*
+        * First scan:
+        *  o find extent of the sample rows
+        *  o count null/not-null values
+        *  o compute total_width
+        *  o compute total features's box area (for avgFeatureArea)
+        */
+       for (i=0; i<samplerows; i++)
+       {
+               Datum datum;
+               GEOMETRY *geom;
+               BOX *box;
+
+               datum = fetchfunc(stats, i, &isnull);
+
+               if ( isnull ) {
+                       null_cnt++;
+                       sampleboxes[i] = NULL;
+                       continue;
+               }
+               notnull_cnt++;
+
+               geom = (GEOMETRY *) PG_DETOAST_DATUM(datum);
+               box = (BOX *)&(geom->bvol);
+               sampleboxes[i] = box;
+
+               sample_extent = union_box3d(&(geom->bvol), sample_extent);
+               
+               // TODO: ask if we need geom or bvol size for stawidth
+               total_width += geom->size;
+               total_boxes_area += (box->high.x-box->low.x)*(box->high.y-box->low.y);
+
+       }
+
+       if ( ! notnull_cnt ) {
+               elog(NOTICE, " no notnull values, invalid stats");
+               stats->stats_valid = true;
+               return;
+       }
+
+       /*
+        * Create the histogram (GEOM_STATS)
+        */
+       old_context = MemoryContextSwitchTo(stats->anl_context);
+       size=sizeof(GEOM_STATS)+
+               (boxesPerSide*boxesPerSide-1)*sizeof(geomstats->value);
+       geomstats = palloc(size);
+       MemoryContextSwitchTo(old_context);
+
+       geomstats->xmin = sample_extent->LLB.x;
+       geomstats->ymin = sample_extent->LLB.y;
+       geomstats->xmax = sample_extent->URT.x;
+       geomstats->ymax = sample_extent->URT.y;
+       geomstats->boxesPerSide = boxesPerSide;
+       geomstats->avgFeatureArea = total_boxes_area/notnull_cnt;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, " histo: xmin,ymin: %f,%f",
+                                                                       geomstats->xmin, geomstats->ymin);
+       elog(NOTICE, " histo: xmax,ymax: %f,%f",
+                                                                       geomstats->xmax, geomstats->ymax);
+       elog(NOTICE, " histo: boxesPerSide: %f", geomstats->boxesPerSide);
+       elog(NOTICE, " histo: avgFeatureArea: %f", geomstats->avgFeatureArea);
+#endif
+
+       /*
+        * Second scan:
+        *  o fill histogram values with the number of
+        *    features' bbox overlaps: a feature's bvol
+        *    can fully overlap (1) or partially overlap
+        *    (fraction of 1) an histogram cell.
+        */
+       for (i=0; i<samplerows; i++)
+       {
+                                       BOX *box;
+                                       double box_area;
+                                       if ( sampleboxes[i] == NULL ) continue;
+                                       box = (BOX *)sampleboxes[i];
+                                       box_area = (box->high.x-box->low.x)*(box->high.y-box->low.y);
+       }
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, " histo: values skipped (not implemented yet)");
+#endif
+
+       /*
+        * Write the statistics data 
+        */
+       stats->stakind[0] = STATISTIC_KIND_GEOMETRY;
+       stats->staop[0] = InvalidOid;
+       stats->stanumbers[0] = (float4 *)geomstats;
+       stats->numnumbers[0] = boxesPerSide*boxesPerSide+6;
+
+       stats->stanullfrac = null_cnt/samplerows;
+       stats->stawidth = total_width/notnull_cnt;
+       stats->stadistinct = -1.0;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, " out: slot 0: kind %d (STATISTIC_KIND_GEOMETRY)",
+                                                                       stats->stakind[0]);
+       elog(NOTICE, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
+       elog(NOTICE, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
+       elog(NOTICE, " out: null fraction: %d/%d", null_cnt, samplerows);
+       elog(NOTICE, " out: average width: %d bytes", stats->stawidth);
+       elog(NOTICE, " out: distinct values: all (no check done)");
+#endif
+
+       stats->stats_valid = true;
+}
+
+/*
+ * This function will be called when the ANALYZE command is run
+ * on a column of the "geometry" type.
+ * 
+ * It will need to return a stats builder function reference
+ * and a "minimum" sample rows to feed it.
+ * If we want analisys to be completely skipped we can return
+ * FALSE and leave output vals untouched.
+ *
+ * What we know from this call is:
+ *
+ *     o The pg_attribute row referring to the specific column.
+ *       Could be used to get reltuples from pg_class (which 
+ *       might quite inexact though...) and use them to set an
+ *       appropriate minimum number of sample rows to feed to
+ *       the stats builder. The stats builder will also receive
+ *       a more accurate "estimation" of the number or rows.
+ *
+ *     o The pg_type row for the specific column.
+ *       Could be used to set stat builder / sample rows
+ *       based on domain type (when postgis will be implemented
+ *       that way).
+ *
+ * Being this experimental we'll stick to a static stat_builder/sample_rows
+ * value for now.
+ *
+ */
+PG_FUNCTION_INFO_V1(geometry_analyze);
+Datum geometry_analyze(PG_FUNCTION_ARGS)
+{
+       VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
+       Form_pg_attribute attr = stats->attr;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, "geometry_analyze called");
+#endif
+
+       /* If the attstattarget column is negative, use the default value */
+       /* NB: it is okay to scribble on stats->attr since it's a copy */
+        if (attr->attstattarget < 0)
+                attr->attstattarget = default_statistics_target;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, " attribute stat target: %d", attr->attstattarget);
+#endif
+
+       /*
+        * There might be a reason not to analyze this column
+        * (can we detect the absence of an index?)
+        */
+       //elog(NOTICE, "compute_geometry_stats not implemented yet");
+       //PG_RETURN_BOOL(false);
+
+       /* Setup the minimum rows and the algorithm function */
+       stats->minrows = 300 * stats->attr->attstattarget;
+       stats->compute_stats = compute_geometry_stats;
+
+#if DEBUG_GEOMETRY_STATS
+       elog(NOTICE, " minrows: %d", stats->minrows);
+#endif
+
+       /* Indicate we are done successfully */
+       PG_RETURN_BOOL(true);
+}
+
+#endif
+