put in a batch geocoding example, more performance enhancements and loading changes.

author Regina Obe <lr@pcorp.us>

Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)

committer Regina Obe <lr@pcorp.us>

Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)
author Regina Obe <lr@pcorp.us>
Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)
committer Regina Obe <lr@pcorp.us>
Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)
diff --git a/doc/extras_tigergeocoder.xml b/doc/extras_tigergeocoder.xml

index f3d9679b12d417c9940e918f49fb741d0861bcc8..aaaf3da0ec1c5b693780830447124761527be0e4 100644 (file)
--- a/doc/extras_tigergeocoder.xml
+++ b/doc/extras_tigergeocoder.xml
@@ -6,10 +6,10 @@
                 <abstract>
                         <para>A plpgsql based geocoder written for <ulink url="http://www.census.gov/geo/www/tiger/index.html">TIGER census data</ulink>.</para>
                         <para>Design:</para>
-                       <para>There are two components to the geocoder, the address normalizer and the address geocoder. </para>
+                       <para>There are three components to the geocoder, the data loader functions, the address normalizer and the address geocoder. </para>
                         <para>The goal of this project is to build a fully functional geocoder that can process an arbitrary 
                         address string and, using normalized TIGER census data, produce a point geometry and rating reflecting the location of the given address.</para>
-                       <para>The geocoder should be simple for anyone familiar with PostGIS to install and use.</para>
+                       <para>The geocoder should be simple for anyone familiar with PostGIS to install and use, and should be easily installable and usable on all platforms supported by PostGIS.</para>
                         <para>It should be robust enough to function properly despite formatting and spelling errors.</para>
                         <para>It should be extensible enough to be used with future data updates, or alternate data sources with a minimum of coding changes.</para>
                 </abstract>
@@ -19,25 +19,25 @@
           <refnamediv>
                 <refname>Geocode</refname>
  
-               <refpurpose>Takes in an address as a string and outputs a set of possible locations which include a point geometry in NAD 83 long lat, a normalized address for each, and the rating.  The lower the rating the more likely the match.  
+               <refpurpose>Takes in an address as a string (or other normalized address) and outputs a set of possible locations which include a point geometry in NAD 83 long lat, a normalized address for each, and the rating.  The lower the rating the more likely the match.  
                         Results are sorted by lowest rating first.</refpurpose>
           </refnamediv>
  
           <refsynopsisdiv>
                 <funcsynopsis>
                   <funcprototype>
-                       <funcdef>setof record<function>geocode</function></funcdef>
+                       <funcdef>setof record <function>geocode</function></funcdef>
                         <paramdef><type>address </type> <parameter>varchar</parameter></paramdef>
-                       <paramdef><type>OUT addy</type> <parameter>norm_addy</parameter></paramdef>
-                       <paramdef><type>OUT geomout</type> <parameter>geometry</parameter></paramdef>
-                       <paramdef><type>OUT rating</type> <parameter>integer</parameter></paramdef>
+                       <paramdef><type>OUT addy </type> <parameter>norm_addy</parameter></paramdef>
+                       <paramdef><type>OUT geomout </type> <parameter>geometry</parameter></paramdef>
+                       <paramdef><type>OUT rating </type> <parameter>integer</parameter></paramdef>
                   </funcprototype>
                   <funcprototype>
-                       <funcdef>setof record<function>geocode</function></funcdef>
+                       <funcdef>setof record <function>geocode</function></funcdef>
                         <paramdef><type>in_addy </type> <parameter>norm_addy</parameter></paramdef>
-                       <paramdef><type>OUT addy</type> <parameter>norm_addy</parameter></paramdef>
-                       <paramdef><type>OUT geomout</type> <parameter>geometry</parameter></paramdef>
-                       <paramdef><type>OUT rating</type> <parameter>integer</parameter></paramdef>
+                       <paramdef><type>OUT addy </type> <parameter>norm_addy</parameter></paramdef>
+                       <paramdef><type>OUT geomout </type> <parameter>geometry</parameter></paramdef>
+                       <paramdef><type>OUT rating </type> <parameter>integer</parameter></paramdef>
                   </funcprototype>
                 </funcsynopsis>
           </refsynopsisdiv>
@@ -45,8 +45,8 @@
           <refsection>
                 <title>Description</title>
  
-               <para>Takes in an address as a string and outputs a set of possible locations which include a point geometry in NAD 83 long lat, a <varname>normalized_address</varname> (addy) for each, and the rating.  The lower the rating the more likely the match.  
-                       Results are sorted by lowest rating first.  The higher the rating the less likely the geocode is right.</para>
+               <para>Takes in an address as a string (or already normalized address) and outputs a set of possible locations which include a point geometry in NAD 83 long lat, a <varname>normalized_address</varname> (addy) for each, and the rating.  The lower the rating the more likely the match.  
+                       Results are sorted by lowest rating first.  Uses Tiger data (edges,faces,addr), PostgreSQL fuzzy string matching (soundex,levenshtein) and PostGIS line interpolation functions to interpolate address along the Tiger edges. The higher the rating the less likely the geocode is right.</para>
  
                 <para>Enhanced: 2.0.0 to support Tiger 2010 structured data and revised some logic to improve speed.</para>
  
@@ -56,8 +56,10 @@
  
           <refsection>
                 <title>Examples</title>
-               <para>Exact matches are fairly fast (205ms)</para>
-               <programlisting>SELECT g.rating, ST_X(g.geomout) As lon, ST_Y(g.geomout) As lat, (addy).address As stno, (addy).streetname As street, 
+               <para>The below examples timings are on a fairly old 1.9 GHZ single processor Windows XP machine with 3GB ram running PostgreSQL 9/PostGIS 2.0 loaded with all of Massachusetts state Tiger data.</para>
+               <para>Exact matches are faster to compute (205ms)</para>
+               <programlisting>SELECT g.rating, ST_X(g.geomout) As lon, ST_Y(g.geomout) As lat, 
+       (addy).address As stno, (addy).streetname As street, 
         (addy).streettypeabbrev As styp, (addy).location As city, (addy).stateabbrev As st,(addy).zip 
         FROM geocode('75 State Street, Boston MA 02109') As g;  
   rating |        lon        |       lat        | stno | street | styp |  city   |st  |  zip
@@ -65,15 +67,17 @@
        0 | -71.0556974285714 | 42.3590795714286 |   75 | State  | St   | Boston  | MA | 02109
  </programlisting>
                 <para>Even if zip is not passed in the geocoder can guess (took about 450 ms)</para>
-               <programlisting>SELECT g.rating, ST_AsText(ST_SnapToGrid(g.geomout,0.00001)) As wktlonlat, (addy).address As stno, (addy).streetname As street, 
+               <programlisting>SELECT g.rating, ST_AsText(ST_SnapToGrid(g.geomout,0.00001)) As wktlonlat, 
+       (addy).address As stno, (addy).streetname As street, 
         (addy).streettypeabbrev As styp, (addy).location As city, (addy).stateabbrev As st,(addy).zip 
         FROM geocode('226 Hanover Street, Boston, MA') As g;  
   rating |         wktlonlat         | stno | street  | styp |  city  | st |  zip
  --------+---------------------------+------+---------+------+--------+----+-------
        0 | POINT(-71.05518 42.36311) |  226 | Hanover | St   | Boston | MA | 02113
  </programlisting>     
-<para>Can handle misspellings and provides more than one possible solution with rankings and takes longer (4 seconds).</para>
-<programlisting>SELECT g.rating, ST_AsText(ST_SnapToGrid(g.geomout,0.00001)) As wktlonlat, (addy).address As stno, (addy).streetname As street, 
+<para>Can handle misspellings and provides more than one possible solution with ratings and takes longer (4 seconds).</para>
+<programlisting>SELECT g.rating, ST_AsText(ST_SnapToGrid(g.geomout,0.00001)) As wktlonlat, 
+    (addy).address As stno, (addy).streetname As street, 
         (addy).streettypeabbrev As styp, (addy).location As city, (addy).stateabbrev As st,(addy).zip 
         FROM geocode('31 - 37 Stewart Street, Boston, MA 02116') As g; 
   rating |         wktlonlat         | stno | street  | styp |     city      | st|  zip
@@ -91,6 +95,48 @@
       70 | POINT(-71.0646 42.35105)  |   31 | Stuart  | St   | Boston        | MA | 02116
  (11 rows)      </programlisting>
  
+<para>Using to do a batch geocode of addresses</para>
+<programlisting>CREATE TABLE addresses_to_geocode(addid serial PRIMARY KEY, address text,
+               lon numeric, lat numeric, new_address text, rating integer);
+
+INSERT INTO addresses_to_geocode(address)
+VALUES ('529 Main Street, Boston MA, 02129'),
+ ('77 Massachusetts Avenue, Cambridge, MA 02139'),
+ ('28 Capen Street, Medford, MA'),
+ ('124 Mount Auburn St, Cambridge, Massachusetts 02138'),
+ ('950 Main Street, Worcester, MA 01610');
+ 
+-- only update the first two addresses --
+-- for large numbers of addresses you don't want to update all at once
+-- since the whole geocode must commit at once (828 ms)
+UPDATE addresses_to_geocode
+  SET  (rating, new_address, lon, lat) 
+       = (g.rating, COALESCE( (g.addy).address::text, '')
+       || COALESCE(' ' || trim((g.addy).predirabbrev ) , '') 
+       || COALESCE(' ' || trim((g.addy).streetname ),'') 
+        || COALESCE(' ' || trim((g.addy).streettypeabbrev ), '') 
+       || COALESCE(' ' || (g.addy).location , '')
+       || COALESCE(', ' || (g.addy).stateabbrev, '') 
+    || COALESCE(' ' || (g.addy).zip, ''),
+               ST_X(g.geomout), ST_Y(g.geomout) )
+FROM (SELECT DISTINCT ON (addid) addid, (g1.geo).*
+       FROM (SELECT addid,  (geocode(address)) As geo
+FROM addresses_to_geocode As ag
+       WHERE ag.rating IS NULL ) As g1
+ORDER BY addid, rating LIMIT 2) As g
+WHERE g.addid = addresses_to_geocode.addid;
+
+result
+-----
+2 rows affected, 850 ms execution time.
+
+SELECT * FROM addresses_to_geocode WHERE rating is not null;
+
+ addid |                   address                    |    lon    |   lat    |             new_address                  | rating
+-------+----------------------------------------------+-----------+----------+------------------------------------------+--------
+     1 | 529 Main Street, Boston MA, 02129            | -71.07187 | 42.38351 | 529 Main St Boston, MA 02129             |      0
+     2 | 77 Massachusetts Avenue, Cambridge, MA 02139 | -71.09436 | 42.35981 | 77 Massachusetts Ave Cambridge, MA 02139 |      0</programlisting>
+
           </refsection>
  
           <!-- Optionally add a "See Also" section -->
diff --git a/extras/tiger_geocoder/tiger_geocoder_2010/create_geocode.sql b/extras/tiger_geocoder/tiger_geocoder_2010/create_geocode.sql

index 3e3fc51c7cae41009b82db2ecc6dc74389b57268..ce9e617373010575127d00292c970287ea4919de 100644 (file)
--- a/extras/tiger_geocoder/tiger_geocoder_2010/create_geocode.sql
+++ b/extras/tiger_geocoder/tiger_geocoder_2010/create_geocode.sql
@@ -41,6 +41,7 @@ CREATE TYPE norm_addy AS (
  \r
  ---- Geocoder functions\r
  -- General helpers\r
+\i geocode/other_helper_functions.sql\r
  \i geocode/rate_attributes.sql\r
  \i geocode/includes_address.sql\r
  \i geocode/interpolate_from_address.sql\r
diff --git a/extras/tiger_geocoder/tiger_geocoder_2010/geocode/geocode_address.sql b/extras/tiger_geocoder/tiger_geocoder_2010/geocode/geocode_address.sql

index bd039b2a575601430477b360fa530da6f269d417..f6b685fe3fde00bfdcd856730b70cbcbfdca2849 100644 (file)
--- a/extras/tiger_geocoder/tiger_geocoder_2010/geocode/geocode_address.sql
+++ b/extras/tiger_geocoder/tiger_geocoder_2010/geocode/geocode_address.sql
@@ -67,24 +67,24 @@ BEGIN
           || '    a.suftypabrv, ' || coalesce(quote_literal(parsed.postDirAbbrev),'NULL') || ','\r
           || '    a.sufdirabrv) + '\r
           || '    CASE '\r
-         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || ' IS NULL OR b.fromhn IS NULL THEN 20'\r
-         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || ' >= least(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
-         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || ' <= greatest(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
-         || '            AND (' || coalesce(quote_literal(parsed.address),'NULL') || ' % 2) = (to_number(b.fromhn,''99999999'') % 2)'\r
+         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer IS NULL OR b.fromhn IS NULL THEN 20'\r
+         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer >= least_hn(b.fromhn, b.tohn) '\r
+         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer <= greatest_hn(b.fromhn,b.tohn)'\r
+         || '            AND (' || coalesce(quote_literal(parsed.address),'NULL') || '::integer % 2) = (to_number(b.fromhn,''99999999'') % 2)::integer'\r
           || '            THEN 0'\r
-         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || ' >= least(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
-         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || ' <= greatest(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
+         || '        WHEN ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer >= least_hn(b.fromhn,b.tohn)'\r
+         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer <= greatest_hn(b.fromhn,b.tohn)'\r
           || '            THEN 2'\r
           || '        ELSE'\r
           || '            ((1.0 - '\r
-         ||              '(least(' || coalesce(quote_literal(parsed.address || '.0'),'NULL') || ',to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999'')) /'\r
-         ||              ' greatest(' || coalesce(quote_literal(parsed.address || '.0'),'NULL') || ',to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999'')))'\r
+         ||              '(least(' || coalesce(quote_literal(parsed.address || '.0'),'NULL') || '::numeric,to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999'')) /'\r
+         ||              ' greatest(' || coalesce(quote_literal(parsed.address || '.0'),'NULL') || '::numeric,to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999'')))'\r
           ||              ') * 5)::integer + 5'\r
           || '        END'\r
           || '    as sub_rating,'\r
-         ||       coalesce(quote_literal(parsed.address),'NULL') || ' >= least(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
-         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || ' <= greatest(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))'\r
-         || '            AND (' || coalesce(quote_literal(parsed.address),'NULL') || ' % 2) = (to_number(b.fromhn,''99999999'') % 2)'\r
+         ||       coalesce(quote_literal(parsed.address),'NULL') || '::integer >= least_hn(b.fromhn,b.tohn) '\r
+         || '            AND ' || coalesce(quote_literal(parsed.address),'NULL') || '::integer <= greatest_hn(b.fromhn,b.tohn) '\r
+         || '            AND (' || coalesce(quote_literal(parsed.address),'NULL') || ' % 2)::numeric::integer = (to_number(b.fromhn,''99999999'') % 2)'\r
           || '    as exact_address'\r
           || '  FROM featnames a join addr b using (tlid,statefp)'\r
           || '  WHERE'\r
@@ -109,7 +109,7 @@ BEGIN
           || ' ORDER BY 1,2,3,4,5,6,7,9'\r
           || ' LIMIT 10'\r
           ;\r
-\r
+    --RAISE NOTICE '%', stmt;\r
      -- If we got an exact street match then when we hit the non-exact\r
      -- set of tests, just drop out.\r
      IF NOT zip_info.exact AND exact_street THEN\r
diff --git a/extras/tiger_geocoder/tiger_geocoder_2010/geocode/other_helper_functions.sql b/extras/tiger_geocoder/tiger_geocoder_2010/geocode/other_helper_functions.sql

new file mode 100644 (file)

index 0000000..f76d95b
--- /dev/null
+++ b/extras/tiger_geocoder/tiger_geocoder_2010/geocode/other_helper_functions.sql
@@ -0,0 +1,17 @@
+-- Note we are wrapping this in a function so we can make it immutable and those useable in an index\r
+-- It also allows us to shorten and possibly better cache the repetitive pattern in the code \r
+-- greatest(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999'')) \r
+-- and least(to_number(b.fromhn,''99999999''),to_number(b.tohn,''99999999''))\r
+CREATE OR REPLACE FUNCTION least_hn(fromhn varchar, tohn varchar)\r
+  RETURNS integer AS\r
+$$ SELECT least(to_number($1,'99999999'),to_number($2,'99999999') )::integer;  $$\r
+  LANGUAGE sql IMMUTABLE\r
+  COST 1;\r
+  \r
+-- Note we are wrapping this in a function so we can make it immutable (for some reason least and greatest aren't considered immutable)\r
+-- and those useable in an index\r
+CREATE OR REPLACE FUNCTION greatest_hn(fromhn varchar, tohn varchar)\r
+  RETURNS integer AS\r
+$$ SELECT greatest(to_number($1,'99999999'),to_number($2,'99999999') )::integer;  $$\r
+  LANGUAGE sql IMMUTABLE\r
+  COST 1;\r
diff --git a/extras/tiger_geocoder/tiger_geocoder_2010/tiger_loader.sql b/extras/tiger_geocoder/tiger_geocoder_2010/tiger_loader.sql

index 5959231fb27238dc808123f509dccaa166cf4b4d..06e8889f7b14705c5e90cfef6cba643e037cc825 100644 (file)
--- a/extras/tiger_geocoder/tiger_geocoder_2010/tiger_loader.sql
+++ b/extras/tiger_geocoder/tiger_geocoder_2010/tiger_loader.sql
@@ -1,4 +1,5 @@
-set search_path = tiger,public;\r
+SET search_path TO tiger,public;\r
+\r
  DROP AGGREGATE IF EXISTS array_accum(anyelement);\r
  CREATE AGGREGATE array_accum(anyelement) (\r
    SFUNC=array_append,\r
@@ -159,9 +160,10 @@ ${psql} -c "CREATE INDEX idx_${data_schema}_${state_abbrev}_zip_lookup_base_city
         \r
  INSERT INTO loader_lookuptables(process_order, lookup_name, table_name, load, level_county, level_state, single_geom_mode, insert_mode, pre_load_process, post_load_process,columns_exclude )\r
  VALUES(9, 'addr', 'addr', true, true, false,false, 'a', \r
-       '${psql} -c "CREATE TABLE ${data_schema}.${state_abbrev}_${table_name}(CONSTRAINT pk_${state_abbrev}_${table_name} PRIMARY KEY (gid)) INHERITS(${table_name});" ',\r
-       '${psql} -c "UPDATE ${data_schema}.${state_abbrev}_${table_name} SET statefp = ''${state_fips}'' WHERE statefp IS NULL;"\r
-       ${psql} -c "ALTER TABLE ${data_schema}.${state_abbrev}_${table_name} ADD CONSTRAINT chk_statefp CHECK (statefp = ''${state_fips}'');"\r
+       '${psql} -c "CREATE TABLE ${data_schema}.${state_abbrev}_${lookup_name}(CONSTRAINT pk_${state_abbrev}_${table_name} PRIMARY KEY (gid)) INHERITS(${table_name});" ',\r
+       '${psql} -c "UPDATE ${data_schema}.${state_abbrev}_${lookup_name} SET statefp = ''${state_fips}'' WHERE statefp IS NULL;"\r
+       ${psql} -c "ALTER TABLE ${data_schema}.${state_abbrev}_${lookup_name} ADD CONSTRAINT chk_statefp CHECK (statefp = ''${state_fips}'');"\r
+       ${psql} -c "CREATE INDEX idx_${data_schema}_${state_abbrev}_${lookup_name}_least_address ON tiger_data.ma_addr USING btree (least_hn(fromhn,tohn) )\r
         ${psql} -c "CREATE INDEX idx_${data_schema}_${state_abbrev}_${table_name}_tlid_statefp ON ${data_schema}.${state_abbrev}_${table_name} USING btree (tlid, statefp);"\r
         ${psql} -c "CREATE INDEX idx_${data_schema}_${state_abbrev}_${table_name}_zip ON ${data_schema}.${state_abbrev}_${table_name} USING btree (zip);"\r
         ${psql} -c "CREATE TABLE ${data_schema}.${state_abbrev}_zip_state(CONSTRAINT pk_${state_abbrev}_zip_state PRIMARY KEY(zip,stusps)) INHERITS(zip_state); "\r
@@ -169,6 +171,7 @@ VALUES(9, 'addr', 'addr', true, true, false,false, 'a',
         ${psql} -c "ALTER TABLE ${data_schema}.${state_abbrev}_zip_state ADD CONSTRAINT chk_statefp CHECK (statefp = ''${state_fips}'');"\r
         ${psql} -c "vacuum analyze ${data_schema}.${state_abbrev}_${lookup_name};"',  ARRAY['gid','statefp','fromarmid', 'toarmid']);\r
  \r
+\r
  CREATE OR REPLACE FUNCTION loader_generate_script(param_states text[], os text)\r
    RETURNS SETOF text AS\r
  $BODY$\r
author	Regina Obe <lr@pcorp.us>
	Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)
committer	Regina Obe <lr@pcorp.us>
	Thu, 27 Jan 2011 18:17:03 +0000 (18:17 +0000)
doc/extras_tigergeocoder.xml		patch \| blob \| history
extras/tiger_geocoder/tiger_geocoder_2010/create_geocode.sql		patch \| blob \| history
extras/tiger_geocoder/tiger_geocoder_2010/geocode/geocode_address.sql		patch \| blob \| history
extras/tiger_geocoder/tiger_geocoder_2010/geocode/other_helper_functions.sql	[new file with mode: 0644]	patch \| blob
extras/tiger_geocoder/tiger_geocoder_2010/tiger_loader.sql		patch \| blob \| history