From f557a4cf468295ca71d133d9a6f3c17116d5881a Mon Sep 17 00:00:00 2001 From: Sandro Santilli Date: Mon, 23 Feb 2004 12:18:55 +0000 Subject: [PATCH] added skeleton functions for pg75 stats integration git-svn-id: http://svn.osgeo.org/postgis/trunk@448 b70326c6-7e19-0410-871a-916f4a2858ee --- Attic/postgis_sql_75_end.sql.in | 8 +- Attic/postgis_sql_75_start.sql.in | 19 +- postgis.h | 6 + postgis_estimate.c | 397 ++++++++++++++++++++++++++++++ 4 files changed, 420 insertions(+), 10 deletions(-) diff --git a/Attic/postgis_sql_75_end.sql.in b/Attic/postgis_sql_75_end.sql.in index cca969f92..989a65f7e 100644 --- a/Attic/postgis_sql_75_end.sql.in +++ b/Attic/postgis_sql_75_end.sql.in @@ -12,6 +12,9 @@ -- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- $Log$ +-- Revision 1.2 2004/02/23 12:18:55 strk +-- added skeleton functions for pg75 stats integration +-- -- Revision 1.1 2004/02/04 22:27:02 strk -- Added pg75 support scripts -- @@ -157,11 +160,6 @@ CREATE FUNCTION estimate_histogram2d(histogram2d,box) AS '@MODULE_FILENAME@','estimate_histogram2d' LANGUAGE 'C' with (isstrict); -CREATE FUNCTION postgisgistcostestimate(internal,internal,internal,internal,internal,internal,internal,internal) - RETURNS opaque - AS '@MODULE_FILENAME@','postgisgistcostestimate' - LANGUAGE 'C' with (isstrict); - -- -- 7.2 GiST support functions -- diff --git a/Attic/postgis_sql_75_start.sql.in b/Attic/postgis_sql_75_start.sql.in index 1d1c54827..2f76df9d2 100644 --- a/Attic/postgis_sql_75_start.sql.in +++ b/Attic/postgis_sql_75_start.sql.in @@ -12,6 +12,9 @@ -- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- $Log$ +-- Revision 1.2 2004/02/23 12:18:55 strk +-- added skeleton functions for pg75 stats integration +-- -- Revision 1.1 2004/02/04 22:27:02 strk -- Added pg75 support scripts -- @@ -36,11 +39,11 @@ BEGIN TRANSACTION; -- if it is, you get a message "PL/PgSQL is installed" -- otherwise it will give a big error message. -CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler AS - '$libdir/plpgsql' LANGUAGE C; +--CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler AS +-- '$libdir/plpgsql' LANGUAGE C; -CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql - HANDLER plpgsql_call_handler; +--CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql +-- HANDLER plpgsql_call_handler; --(select 'PL/PgSQL is installed.' as message from pg_language where lanname='plpgsql') union (select 'You must install PL/PgSQL before running this SQL file,\nor you will get an error. To install PL/PgSQL run:\n\tcreatelang plpgsql '::text as message) order by message limit 1; @@ -158,12 +161,18 @@ CREATE FUNCTION geometry_out(geometry) AS '@MODULE_FILENAME@' LANGUAGE 'C' WITH (isstrict); +CREATE FUNCTION geometry_analyze(internal) + RETURNS bool + AS '@MODULE_FILENAME@' + LANGUAGE 'C' WITH (isstrict); + CREATE TYPE geometry ( alignment = double, internallength = variable, input = geometry_in, output = geometry_out, - storage = main + storage = main, + analyze = geometry_analyze ); -- diff --git a/postgis.h b/postgis.h index 001c292a4..9d3080001 100644 --- a/postgis.h +++ b/postgis.h @@ -11,6 +11,9 @@ * ********************************************************************** * $Log$ + * Revision 1.42 2004/02/23 12:18:55 strk + * added skeleton functions for pg75 stats integration + * * Revision 1.41 2004/01/25 19:33:00 pramsey * Test commit on new CVS archive. * @@ -618,6 +621,9 @@ Datum geometry2box(PG_FUNCTION_ARGS); Datum explode_histogram2d(PG_FUNCTION_ARGS); Datum estimate_histogram2d(PG_FUNCTION_ARGS); +#if USE_VERSION >= 75 +Datum geometry_analyze(PG_FUNCTION_ARGS); +#endif Datum postgisgistcostestimate(PG_FUNCTION_ARGS); diff --git a/postgis_estimate.c b/postgis_estimate.c index e8d81b539..3398a5ef9 100644 --- a/postgis_estimate.c +++ b/postgis_estimate.c @@ -11,6 +11,9 @@ * ********************************************************************** * $Log$ + * Revision 1.8 2004/02/23 12:18:55 strk + * added skeleton functions for pg75 stats integration + * * Revision 1.7 2003/11/11 10:14:57 strk * Added support for PG74 * @@ -76,6 +79,39 @@ #include "executor/spi.h" +#if USE_VERSION >= 75 + +#define DEBUG_GEOMETRY_STATS 1 + +#include "commands/vacuum.h" + +/* + * Assign a number to the postgis statistics kind + * + * tgl suggested: + * + * 1-100: reserved for assignment by the core Postgres project + * 100-199: reserved for assignment by PostGIS + * 200-9999: reserved for other globally-known stats kinds + * 10000-32767: reserved for private site-local use + * + */ +#define STATISTIC_KIND_GEOMETRY 100 + +typedef struct GEOM_STATS_T +{ + //boxesPerSide * boxesPerSide = total boxes in grid + float4 boxesPerSide; + // average bbox area of features in this histogram + float4 avgFeatureArea; + // BOX of area + float4 xmin,ymin, xmax, ymax; + // variable length # of floats for histogram + float4 value[1]; +} GEOM_STATS; + +#endif + //estimate_histogram2d(histogram2d, box) // returns a % estimate of the # of features that will be returned by that box query @@ -575,6 +611,7 @@ get_restriction_var(List *args, +#if USE_VERSION < 75 //restriction in the GiST && operator PG_FUNCTION_INFO_V1(postgis_gist_sel); Datum postgis_gist_sel(PG_FUNCTION_ARGS) @@ -865,4 +902,364 @@ postgisgistcostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +#else // USE_VERSION >= 75 + +/* + * This function should return an estimation of the number of + * rows returned by a query involving an overlap check + * ( it's the restrict function for the && operator ) + * + * It can make use (if available) of the statistics collected + * by the geometry analyzer function. + * + */ +PG_FUNCTION_INFO_V1(postgis_gist_sel); +Datum postgis_gist_sel(PG_FUNCTION_ARGS) +{ + Query *root = (Query *) PG_GETARG_POINTER(0); + // Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + GEOMETRY *in; + BOX *search_box; + char sql[1000]; + SPITupleTable *tuptable; + TupleDesc tupdesc ; + HeapTuple tuple ; + Datum datum; + bool isnull; + Var *var; + Node *other; + bool varonleft; + Oid relid; + int SPIcode; + double myest; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, "postgis_gist_sel called"); + elog(NOTICE, " returning a fixed estimate of 0.000005%%"); +#endif + PG_RETURN_FLOAT8(0.000005); + + if (!get_restriction_var(args, varRelid, + &var, &other, &varonleft)) + { + //elog(NOTICE,"get_restriction_var FAILED -returning early"); + PG_RETURN_FLOAT8(0.000005); + } + + relid = getrelid(var->varno, root->rtable); + if (relid == InvalidOid) + { + //elog(NOTICE,"getrelid FAILED (invalid oid) -returning early"); + PG_RETURN_FLOAT8(0.000005); + } + + //elog(NOTICE,"operator's oid = %i (this should be GEOMETRY && GEOMETRY)",operator); + //elog(NOTICE,"relations' oid = %i (this should be the relation that the && is working on) ",relid); + //elog(NOTICE,"varatt oid = %i (basically relations column #) ",var->varattno); + + + if (IsA(other, Const) &&((Const *) other)->constisnull) + { + //elog(NOTICE,"other operand of && is NULL - returning early"); + PG_RETURN_FLOAT8(0.000005); + } + + if (IsA(other, Const)) + { + //elog(NOTICE,"The other side of the && is a constant with type (oid) = %i and length %i. This should be GEOMETRY with length -1 (variable length)",((Const*)other)->consttype,((Const*)other)->constlen); + } + else + { + //elog(NOTICE,"the other side of && isnt a constant - returning early"); + PG_RETURN_FLOAT8(0.000005); + } + + //get the BOX thats being searched in + in = (GEOMETRY*)PG_DETOAST_DATUM( ((Const*)other)->constvalue ); + search_box = convert_box3d_to_box(&in->bvol); + + //elog(NOTICE,"requested search box is : (%.15g %.15g, %.15g %.15g)",search_box->low.x,search_box->low.y,search_box->high.x,search_box->high.y); + + + SPIcode = SPI_connect(); + if (SPIcode != SPI_OK_CONNECT) + { + elog(NOTICE,"postgis_gist_sel: couldnt open a connection to SPI:%i",SPIcode); + PG_RETURN_FLOAT8(0.000005) ; + } + + sprintf(sql,"SELECT stats FROM GEOMETRY_COLUMNS WHERE attrelid=%u AND varattnum=%i",relid,var->varattno); + //elog(NOTICE,"sql:%s",sql); + SPIcode = SPI_exec(sql, 1 ); + if (SPIcode != SPI_OK_SELECT ) + { + SPI_finish(); + elog(NOTICE,"postgis_gist_sel: couldnt execute sql via SPI"); + PG_RETURN_FLOAT8(0.000005) ; + } + + if (SPI_processed !=1) + { + SPI_finish(); + //elog(NOTICE,"postgis_gist_sel: geometry_columns didnt return a unique value"); + PG_RETURN_FLOAT8(0.000005) ; + } + + tuptable = SPI_tuptable; + tupdesc = SPI_tuptable->tupdesc; + tuple = tuptable->vals[0]; + datum = SPI_getbinval(tuple, tupdesc, 1, &isnull); + if (isnull) + { + SPI_finish(); + //elog(NOTICE,"postgis_gist_sel: geometry_columns returned a null histogram"); + PG_RETURN_FLOAT8(0.000005) ; + } + + //elog(NOTICE,"postgis_gist_sel: checking against estimate_histogram2d"); + // now we have the histogram, and our search box - use the estimate_histogram2d(histo,box) to get the result! + myest = DatumGetFloat8( DirectFunctionCall2( estimate_histogram2d, datum, PointerGetDatum(search_box) ) ); + + if ( (myest<0) || (myest!=myest) ) // <0? or NaN? + { + //elog(NOTICE,"postgis_gist_sel: got something crazy back from estimate_histogram2d"); + PG_RETURN_FLOAT8(0.000005) ; + } + + SPIcode =SPI_finish(); + if (SPIcode != SPI_OK_FINISH ) + { + //elog(NOTICE,"postgis_gist_sel: couldnt disconnect from SPI"); + PG_RETURN_FLOAT8(0.000005) ; + } + //elog(NOTICE,"postgis_gist_sel: finished, returning with %lf",myest); + PG_RETURN_FLOAT8(myest); +} + +/* + * This function is called by the analyze function iff + * the geometry_analyze() function give it its pointer + * (this is always the case so far). + * The geometry_analyze() function is also responsible + * of deciding the number of "sample" rows we will receive + * here. It is able to give use other 'custom' data, but we + * won't use them so far. + * + * Our job is to build some statistics on the sample data + * for use by operator estimators. + * + * Currently we only need statistics to estimate the number of rows + * overlapping a given extent (estimation function bound + * to the && operator). + * + */ +static void +compute_geometry_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, + int samplerows, double totalrows) +{ + MemoryContext old_context; + int i; + int size; + BOX **sampleboxes; + GEOM_STATS *geomstats; + bool isnull; + int null_cnt=0, notnull_cnt=0; + BOX3D *sample_extent=NULL; + double total_width=0; + double total_boxes_area=0; + /* + * This is where geometry_analyze + * should put its' custom parameters. + * boxesPerSide would be a good candidate. + */ + //void *mystats = stats->extra_data; + int boxesPerSide = 40; + + + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, "compute_geometry_stats called"); + elog(NOTICE, " samplerows: %d", samplerows); +#endif + + sampleboxes = palloc(sizeof(BOX *)*samplerows); + + /* + * First scan: + * o find extent of the sample rows + * o count null/not-null values + * o compute total_width + * o compute total features's box area (for avgFeatureArea) + */ + for (i=0; ibvol); + sampleboxes[i] = box; + + sample_extent = union_box3d(&(geom->bvol), sample_extent); + + // TODO: ask if we need geom or bvol size for stawidth + total_width += geom->size; + total_boxes_area += (box->high.x-box->low.x)*(box->high.y-box->low.y); + + } + + if ( ! notnull_cnt ) { + elog(NOTICE, " no notnull values, invalid stats"); + stats->stats_valid = true; + return; + } + + /* + * Create the histogram (GEOM_STATS) + */ + old_context = MemoryContextSwitchTo(stats->anl_context); + size=sizeof(GEOM_STATS)+ + (boxesPerSide*boxesPerSide-1)*sizeof(geomstats->value); + geomstats = palloc(size); + MemoryContextSwitchTo(old_context); + + geomstats->xmin = sample_extent->LLB.x; + geomstats->ymin = sample_extent->LLB.y; + geomstats->xmax = sample_extent->URT.x; + geomstats->ymax = sample_extent->URT.y; + geomstats->boxesPerSide = boxesPerSide; + geomstats->avgFeatureArea = total_boxes_area/notnull_cnt; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, " histo: xmin,ymin: %f,%f", + geomstats->xmin, geomstats->ymin); + elog(NOTICE, " histo: xmax,ymax: %f,%f", + geomstats->xmax, geomstats->ymax); + elog(NOTICE, " histo: boxesPerSide: %f", geomstats->boxesPerSide); + elog(NOTICE, " histo: avgFeatureArea: %f", geomstats->avgFeatureArea); +#endif + + /* + * Second scan: + * o fill histogram values with the number of + * features' bbox overlaps: a feature's bvol + * can fully overlap (1) or partially overlap + * (fraction of 1) an histogram cell. + */ + for (i=0; ihigh.x-box->low.x)*(box->high.y-box->low.y); + } +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, " histo: values skipped (not implemented yet)"); +#endif + + /* + * Write the statistics data + */ + stats->stakind[0] = STATISTIC_KIND_GEOMETRY; + stats->staop[0] = InvalidOid; + stats->stanumbers[0] = (float4 *)geomstats; + stats->numnumbers[0] = boxesPerSide*boxesPerSide+6; + + stats->stanullfrac = null_cnt/samplerows; + stats->stawidth = total_width/notnull_cnt; + stats->stadistinct = -1.0; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, " out: slot 0: kind %d (STATISTIC_KIND_GEOMETRY)", + stats->stakind[0]); + elog(NOTICE, " out: slot 0: op %d (InvalidOid)", stats->staop[0]); + elog(NOTICE, " out: slot 0: numnumbers %d", stats->numnumbers[0]); + elog(NOTICE, " out: null fraction: %d/%d", null_cnt, samplerows); + elog(NOTICE, " out: average width: %d bytes", stats->stawidth); + elog(NOTICE, " out: distinct values: all (no check done)"); +#endif + + stats->stats_valid = true; +} + +/* + * This function will be called when the ANALYZE command is run + * on a column of the "geometry" type. + * + * It will need to return a stats builder function reference + * and a "minimum" sample rows to feed it. + * If we want analisys to be completely skipped we can return + * FALSE and leave output vals untouched. + * + * What we know from this call is: + * + * o The pg_attribute row referring to the specific column. + * Could be used to get reltuples from pg_class (which + * might quite inexact though...) and use them to set an + * appropriate minimum number of sample rows to feed to + * the stats builder. The stats builder will also receive + * a more accurate "estimation" of the number or rows. + * + * o The pg_type row for the specific column. + * Could be used to set stat builder / sample rows + * based on domain type (when postgis will be implemented + * that way). + * + * Being this experimental we'll stick to a static stat_builder/sample_rows + * value for now. + * + */ +PG_FUNCTION_INFO_V1(geometry_analyze); +Datum geometry_analyze(PG_FUNCTION_ARGS) +{ + VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0); + Form_pg_attribute attr = stats->attr; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, "geometry_analyze called"); +#endif + + /* If the attstattarget column is negative, use the default value */ + /* NB: it is okay to scribble on stats->attr since it's a copy */ + if (attr->attstattarget < 0) + attr->attstattarget = default_statistics_target; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, " attribute stat target: %d", attr->attstattarget); +#endif + + /* + * There might be a reason not to analyze this column + * (can we detect the absence of an index?) + */ + //elog(NOTICE, "compute_geometry_stats not implemented yet"); + //PG_RETURN_BOOL(false); + + /* Setup the minimum rows and the algorithm function */ + stats->minrows = 300 * stats->attr->attstattarget; + stats->compute_stats = compute_geometry_stats; + +#if DEBUG_GEOMETRY_STATS + elog(NOTICE, " minrows: %d", stats->minrows); +#endif + + /* Indicate we are done successfully */ + PG_RETURN_BOOL(true); +} + +#endif + -- 2.40.0