From 7f24e6b2d00500a795a11e5a156ac876fae0f54e Mon Sep 17 00:00:00 2001 From: Sandro Santilli Date: Mon, 21 Jul 2014 16:39:56 +0000 Subject: [PATCH] Generalize estimator code to also work with expressional indexes Improves raster selectivity estimator (#2839) Includes regression tests git-svn-id: http://svn.osgeo.org/postgis/trunk@12814 b70326c6-7e19-0410-871a-916f4a2858ee --- NEWS | 4 ++ postgis/gserialized_estimate.c | 92 +++++++++++++++++----------------- regress/regress_index.sql | 31 +++++++----- regress/regress_index_expected | 6 ++- 4 files changed, 74 insertions(+), 59 deletions(-) diff --git a/NEWS b/NEWS index d9952d7f1..6ad847123 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,7 @@ PostGIS 2.2.0 * New Features * - #2843, Support reprojection on raster import + (Sandro Santilli / Vizzuality) - #2349, Support for encoded_polyline input/output (Kashif Rasul) - #2159, report libjson version from postgis_full_version() - #2770, ST_MemSize(raster) @@ -41,6 +42,9 @@ PostGIS 2.2.0 * Enhancements * + - #2839, Implement selectivity estimator for functional indexes, + speeding up spatial queries on raster tables. + (Sandro Santilli / Vizzuality) - #2361, Added spatial_index column to raster_columns view - #2390, Testsuite for pgsql2shp - #2527, Added -k flag to raster2pgsql to skip checking that diff --git a/postgis/gserialized_estimate.c b/postgis/gserialized_estimate.c index 348769bc8..93eea1e3a 100644 --- a/postgis/gserialized_estimate.c +++ b/postgis/gserialized_estimate.c @@ -56,6 +56,7 @@ dimensionality cases. (2D geometry) &&& (3D column), etc. #include "utils/builtins.h" #include "utils/syscache.h" #include "utils/rel.h" +#include "utils/selfuncs.h" #include "../postgis_config.h" @@ -799,6 +800,35 @@ nd_increment(ND_IBOX *ibox, int ndims, int *counter) return TRUE; } +static ND_STATS* +pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode) +{ + int stats_kind = STATISTIC_KIND_ND; + int rv, nvalues; + float4 *floatptr; + ND_STATS *nd_stats; + + /* If we're in 2D mode, set the kind appropriately */ + if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D; + + /* Then read the geom status histogram from that */ + rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid, + NULL, NULL, NULL, &floatptr, &nvalues); + if ( ! rv ) { + POSTGIS_DEBUGF(2, + "no slot of kind %d in stats tuple", stats_kind); + return NULL; + } + + /* Clone the stats here so we can release the attstatsslot immediately */ + nd_stats = palloc(sizeof(float) * nvalues); + memcpy(nd_stats, floatptr, sizeof(float) * nvalues); + + /* Clean up */ + free_attstatsslot(0, NULL, 0, floatptr, nvalues); + + return nd_stats; +} /** * Pull the stats object from the PgSQL system catalogs. Used @@ -808,10 +838,7 @@ static ND_STATS* pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode) { HeapTuple stats_tuple; - float4 *floatptr; ND_STATS *nd_stats; - int rv, nvalues; - int stats_kind = STATISTIC_KIND_ND; /* First pull the stats tuple */ stats_tuple = SearchSysCache2(STATRELATT, table_oid, att_num); @@ -820,32 +847,19 @@ pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode) POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL"); return NULL; } - - /* If we're in 2D mode, set the kind appropriately */ - if ( mode == 2 ) - stats_kind = STATISTIC_KIND_2D; - - /* Then read the geom status histogram from that */ - rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid, NULL, NULL, NULL, &floatptr, &nvalues); - if ( ! rv ) + + nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode); + ReleaseSysCache(stats_tuple); + if ( ! nd_stats ) { - ReleaseSysCache(stats_tuple); - POSTGIS_DEBUGF(2, "histogram for \"%s\" does not exist?", get_rel_name(table_oid)); - return NULL; + POSTGIS_DEBUGF(2, + "histogram for attribute %d of table \"%s\" does not exist?", + att_num, get_rel_name(table_oid)); } - /* Clone the stats here so we can release the attstatsslot immediately */ - nd_stats = palloc(sizeof(float) * nvalues); - memcpy(nd_stats, floatptr, sizeof(float) * nvalues); - - /* Clean up */ - free_attstatsslot(0, NULL, 0, floatptr, nvalues); - ReleaseSysCache(stats_tuple); - return nd_stats; } - /** * Pull the stats object from the PgSQL system catalogs. The * debugging functions are taking human input (table names) @@ -2052,8 +2066,8 @@ Datum gserialized_gist_sel(PG_FUNCTION_ARGS) /* int varRelid = PG_GETARG_INT32(3); */ int mode = PG_GETARG_INT32(4); - Oid relid; - ND_STATS *nd_stats; + VariableStatData vardata; + ND_STATS *nd_stats = NULL; Node *other; Var *self; @@ -2096,25 +2110,6 @@ Datum gserialized_gist_sel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(DEFAULT_ND_SEL); } - /* - * We don't have a nice && or && - * situation here. && would probably get evaluated - * away by PgSQL earlier on. && is harder, and the - * case we get often is && ST_Expand(), which does - * actually have a subtly different selectivity than a bae - * && call. It's calculatable though, by expanding - * every cell in the histgram appropriately. - * - * Discussion: http://trac.osgeo.org/postgis/ticket/1828 - * - * To do? Do variable selectivity based on the node. - */ - if ( ! IsA(self, Var) ) - { - POSTGIS_DEBUG(3, " no bare variable argument ? - returning a moderate selectivity"); - PG_RETURN_FLOAT8(FALLBACK_ND_SEL); - } - /* Convert the constant to a BOX */ if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) ) { @@ -2124,13 +2119,18 @@ Datum gserialized_gist_sel(PG_FUNCTION_ARGS) POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box)); /* Get pg_statistic row */ - relid = getrelid(self->varno, root->parse->rtable); - nd_stats = pg_get_nd_stats(relid, self->varattno, mode); + examine_variable(root, (Node*)self, 0, &vardata); + if ( vardata.statsTuple ) { + nd_stats = pg_nd_stats_from_tuple(vardata.statsTuple, mode); + } + ReleaseVariableStats(vardata); + if ( ! nd_stats ) { POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?"); PG_RETURN_FLOAT8(FALLBACK_ND_SEL); } + POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats)); /* Do the estimation! */ diff --git a/regress/regress_index.sql b/regress/regress_index.sql index 6cd666779..e3b90c32c 100644 --- a/regress/regress_index.sql +++ b/regress/regress_index.sql @@ -46,25 +46,32 @@ $$; -- There are 50000 points in the table with full extent being -- BOX(0.0439142361 0.0197799355,999.955261 999.993652) +CREATE TABLE sample_queries AS +SELECT 1 as id, 5 as tol, 'ST_MakeEnvelope(125,125,135,135)' as box + UNION ALL +SELECT 2, 60, 'ST_MakeEnvelope(0,0,135,135)' + UNION ALL +SELECT 3, 500, 'ST_MakeEnvelope(0,0,500,500)' + UNION ALL +SELECT 4, 600, 'ST_MakeEnvelope(0,0,1000,1000)' +; ANALYZE test; -SELECT estimate_error(' - select num from test where the_geom && ST_MakeEnvelope(125,125,135,135); -', 5); +SELECT estimate_error( + 'select num from test where the_geom && ' || box, tol ) + FROM sample_queries ORDER BY id; -select estimate_error(' - select num from test where the_geom && ST_MakeEnvelope(0,0,135,135); -', 50); +-- Test selectivity estimation of functional indexes -SELECT estimate_error(' - select num from test where the_geom && ST_MakeEnvelope(0,0,500,500); -', 500); +CREATE INDEX expressional_gist on test using gist ( st_centroid(the_geom) ); +ANALYZE test; -SELECT estimate_error(' - select num from test where the_geom && ST_MakeEnvelope(0,0,1000,1000); -', 600); +SELECT 'expr', estimate_error( + 'select num from test where st_centroid(the_geom) && ' || box, tol ) + FROM sample_queries ORDER BY id; DROP TABLE test; +DROP TABLE sample_queries; DROP FUNCTION estimate_error(text, int); diff --git a/regress/regress_index_expected b/regress/regress_index_expected index 035aa0f3d..012e42245 100644 --- a/regress/regress_index_expected +++ b/regress/regress_index_expected @@ -5,6 +5,10 @@ 3618|POINT(130.447205 131.655289) 7245|POINT(128.10466 130.94133) 3+=5:true -924+=50:true +924+=60:true 12621+=500:true 50000+=600:true +expr|3+=5:true +expr|924+=60:true +expr|12621+=500:true +expr|50000+=600:true -- 2.40.0