]> granicus.if.org Git - postgis/commitdiff
#3572, ST_ClusterDBSCAN should not join clusters by their borders
authorDaniel Baston <dbaston@gmail.com>
Mon, 6 Jun 2016 12:24:32 +0000 (12:24 +0000)
committerDaniel Baston <dbaston@gmail.com>
Mon, 6 Jun 2016 12:24:32 +0000 (12:24 +0000)
git-svn-id: http://svn.osgeo.org/postgis/trunk@14934 b70326c6-7e19-0410-871a-916f4a2858ee

doc/reference_measure.xml
liblwgeom/cunit/cu_geos_cluster.c
liblwgeom/cunit/cu_tester.h
liblwgeom/cunit/cu_unionfind.c
liblwgeom/lwgeom_geos.h
liblwgeom/lwgeom_geos_cluster.c
liblwgeom/lwunionfind.c
liblwgeom/lwunionfind.h
postgis/lwgeom_geos.c
postgis/lwgeom_window.c
regress/cluster_expected

index 98dd6f591a7bc78135bd91d4a7bc880832005eaf..f13824fd94524e4c72b995de4cb42fe7b910048f 100644 (file)
@@ -1077,15 +1077,37 @@ SELECT ST_AsText(
          <refsection>
       <title>Description</title>
 
-      <para>Returns cluster number for each input geometry, based on a 2D 
-          implementation of the 
+         <para>
+                 Returns cluster number for each input geometry, based on a 2D implementation of the 
           <ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink> 
-          algorithm.  An input geometry will be added to a cluster if it is 
-          within <varname>eps</varname> distance of at least
-          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
-          clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters 
-          to construct each cluster.  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
-      </para>
+                 algorithm.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of clusters to be specified, but instead 
+                 uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters to construct each cluster.
+         </para>
+                 
+         <para>
+                 An input geometry will be added to a cluster if it is either:
+                 <itemizedlist>
+                         <listitem>
+                                 A "core" geometry, that is within <varname>eps</varname> distance of at least <varname>minpoints</varname> other input geometries, or
+                         </listitem>
+                         <listitem>
+                                 A "border" geometry, that is within <varname>eps</varname> distance of a core geometry.
+                         </listitem>
+                 </itemizedlist>
+               </para>
+
+               <para>
+                 Note that border geometries may be within <varname>eps</varname> distance of core geometries in more than one cluster; in this
+                 case, either assignment would be correct, and the border geometry will be arbitrarily asssigned to one of the available clusters.
+                 In these cases, it is possible for a correct cluster to be generated with fewer than <varname>minpoints</varname> geometries.
+                 When assignment of a border geometry is ambiguous, repeated calls to ST_ClusterDBSCAN will produce identical results if an ORDER BY 
+                 clause is included in the window definition, but cluster assignments may differ from other implementations of the same algorithm.
+         </para>
+
+         <para>
+                 Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
+         </para>
+
       <para>Availability: 2.3.0 - requires GEOS </para>
     </refsection>
 
index 80215e935fb698673fe10f41bc17d0f44d38fcc0..76720b9c4e7ffa6d13b228da9da8d56425fc9f1f 100644 (file)
@@ -255,6 +255,41 @@ static void multipoint_test(void)
        perform_cluster_intersecting_test(wkt_inputs_pt, 2, expected_outputs_pt, 1);
 }
 
+static void dbscan_test(void)
+{
+       char* wkt_inputs[] = { "POINT (0 0)", "POINT (-1 0)", "POINT (-1 -0.1)", "POINT (-1 0.1)",
+                                  "POINT (1 0)",
+                                                  "POINT (2 0)", "POINT (3  0)", "POINT ( 3 -0.1)", "POINT ( 3 0.1)" };
+       uint32_t num_geoms = sizeof(wkt_inputs) / sizeof(char*);
+       LWGEOM** geoms = WKTARRAY2LWGEOM(wkt_inputs, num_geoms);
+       UNIONFIND* uf = UF_create(num_geoms);
+       uint32_t* ids;
+       char* in_a_cluster;
+       uint32_t i;
+
+       /* Although POINT (1 0) and POINT (2 0) are within eps distance of each other,
+        * they do not connect the two clusters because POINT (1 0) is not a core point.
+        * See #3572
+        */
+       double eps = 1.01;
+       uint32_t min_points = 5;
+       uint32_t expected_ids[] = { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 };
+
+       union_dbscan(geoms, num_geoms, uf, eps, min_points, &in_a_cluster);
+       ids = UF_get_collapsed_cluster_ids(uf, in_a_cluster);
+
+       ASSERT_INTARRAY_EQUAL(ids, expected_ids, num_geoms);
+
+       UF_destroy(uf);
+       for (i = 0; i < num_geoms; i++)
+       {
+               lwgeom_free(geoms[i]);
+       }
+       lwfree(geoms);
+       lwfree(in_a_cluster);
+       lwfree(ids);
+}
+
 void geos_cluster_suite_setup(void);
 void geos_cluster_suite_setup(void)
 {
@@ -265,4 +300,5 @@ void geos_cluster_suite_setup(void)
        PG_ADD_TEST(suite, single_input_test);
        PG_ADD_TEST(suite, empty_inputs_test);
        PG_ADD_TEST(suite, multipoint_test);
+       PG_ADD_TEST(suite, dbscan_test);
 }
index ea606d7d347398e3d3f481bd45fd0a732843ac6d..9e245df31dd3edc87fc4f6b5baa4e2cf597bbadf 100644 (file)
@@ -54,5 +54,24 @@ typedef void (*PG_SuiteSetup)(void);
        CU_ASSERT_TRUE(lwgeom_same(o, e)); \
 } while(0);
 
+#define ASSERT_INTARRAY_EQUAL(o, e, n) do { \
+       size_t i = 0; \
+       for (i = 0; i < n; i++) { \
+               if (o[i] != e[i]) { \
+                       fprintf(stderr, "[%s:%d]", __FILE__, __LINE__); \
+                       fprintf(stderr, "\nExpected: ["); \
+                       for (i = 0; i < n; i++) \
+                               fprintf(stderr, " %d", e[i]); \
+                       fprintf(stderr, " ]\nObtained: ["); \
+                       for (i = 0; i < n; i++) \
+                               fprintf(stderr, " %d", o[i]); \
+                       fprintf(stderr, " ]\n"); \
+                       CU_FAIL(); \
+                       break; \
+               } \
+       } \
+       CU_PASS(); \
+} while(0);
+
 /* Utility functions */
 void do_fn_test(LWGEOM* (*transfn)(LWGEOM*), char *input_wkt, char *expected_wkt);
index cb5566fa9cd86ab4cf393a7743efec055e3e1dd3..b4826389295979d1b3cb972786276ed9aa12d992 100644 (file)
@@ -22,10 +22,10 @@ static void test_unionfind_create(void)
        uint32_t expected_initial_ids[] =   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint32_t expected_initial_sizes[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
 
-       CU_ASSERT_EQUAL(10, uf->N);
-       CU_ASSERT_EQUAL(10, uf->num_clusters);
-       CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_initial_ids, 10*sizeof(uint32_t)));
-       CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_initial_sizes, 10*sizeof(uint32_t)));
+       ASSERT_INT_EQUAL(uf->N, 10);
+       ASSERT_INT_EQUAL(uf->num_clusters, 10);
+       ASSERT_INTARRAY_EQUAL(uf->clusters, expected_initial_ids, 10);
+       ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_initial_sizes, 10);
 
        UF_destroy(uf);
 }
@@ -42,10 +42,10 @@ static void test_unionfind_union(void)
        uint32_t expected_final_ids[] =   { 0, 2, 2, 2, 4, 5, 6, 0, 0, 9 };
        uint32_t expected_final_sizes[] = { 3, 0, 3, 0, 1, 1, 1, 0, 0, 1 };
 
-       CU_ASSERT_EQUAL(10, uf->N);
-       CU_ASSERT_EQUAL(6, uf->num_clusters);
-       CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_final_ids, 10*sizeof(uint32_t)));
-       CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_final_sizes, 10*sizeof(uint32_t)));
+       ASSERT_INT_EQUAL(uf->N, 10);
+       ASSERT_INT_EQUAL(uf->num_clusters, 6);
+       ASSERT_INTARRAY_EQUAL(uf->clusters, expected_final_ids, 10);
+       ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_final_sizes, 10);
 
        UF_destroy(uf);
 }
@@ -140,17 +140,23 @@ static void test_unionfind_collapse_cluster_ids(void)
         * 7 -> 1
         * 8 -> 2
         */
-       uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
-       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
+       uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
+       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, NULL);
 
-       CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+       ASSERT_INTARRAY_EQUAL(collapsed_ids, expected_collapsed_ids, 10);
 
        lwfree(collapsed_ids);
 
-       uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
-       collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+       char is_in_cluster[] = { 0, 1, 1, 1, 0, 1, 0, 0, 0, 0 };
+       uint32_t expected_collapsed_ids2[] = { 8, 0, 0, 0, 7, 0, 8, 7, 8, 7 };
 
-       CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+       collapsed_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
+       int i;
+       for (i = 0; i < uf->N; i++)
+       { 
+               if (is_in_cluster[i])
+                       ASSERT_INT_EQUAL(expected_collapsed_ids2[i], collapsed_ids[i]);
+       }
 
        lwfree(collapsed_ids);
        UF_destroy(uf);
index d813df09a5135c76274cf96bd7686e1cf3ad8dbf..6684c5442fdefc844f08943aaa5be6ece86e0487 100644 (file)
@@ -43,8 +43,7 @@ GEOSGeometry * LWGEOM_GEOS_buildArea(const GEOSGeometry* geom_in);
 
 int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
 int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points);
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** is_in_cluster_ret);
 
 POINTARRAY *ptarray_from_GEOSCoordSeq(const GEOSCoordSequence *cs, char want3d);
 
index 505e0333c27dacf9f14cc1f528f66248ba14d9c8..ce45165c6b7333d6306bc317f9cdaa9887435634 100644 (file)
@@ -235,16 +235,6 @@ cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** c
        return cluster_success;
 }
 
-/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
- *  GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
- *  responsible for freeing the input array, but not the LWGEOM* items inside it. */
-int
-cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
-{
-       int min_points = 1;
-       return cluster_dbscan(geoms, num_geoms, tolerance, min_points, clusterGeoms, num_clusters);
-}
-
 
 static int
 dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
@@ -268,7 +258,7 @@ dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geom
 }
 
 int
-union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points)
+union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
 {
        uint32_t p, i;
        struct STRTree tree;
@@ -279,6 +269,8 @@ union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint
                .items_found_size = 0
        };
        int success = LW_SUCCESS;
+       char* in_a_cluster;
+       char* is_in_core;
 
        if (num_geoms <= 1)
                return LW_SUCCESS;
@@ -290,6 +282,11 @@ union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint
                return LW_FAILURE;
        }
 
+       in_a_cluster = lwalloc(num_geoms * sizeof(char));
+       memset(in_a_cluster, 0, num_geoms * sizeof(char));
+       is_in_core = lwalloc(num_geoms * sizeof(char));
+       memset(is_in_core, 0, num_geoms * sizeof(char));
+
        for (p = 0; p < num_geoms; p++)
        {
                uint32_t num_neighbors = 0;
@@ -313,23 +310,53 @@ union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint
                        }
 
                        if (mindist <= eps)
+                       {
                                neighbors[num_neighbors++] = q;
+                       }
                }
 
+               if (!success)
+                       break;
+
                if (num_neighbors >= min_points)
                {
+                       in_a_cluster[p] = LW_TRUE;
+                       is_in_core[p] = LW_TRUE;
+
                        for (i = 0; i < num_neighbors; i++)
                        {
-                               UF_union(uf, p, neighbors[i]);
+                               uint32_t q = neighbors[i];
+
+                               if (in_a_cluster[q])
+                               {
+                                       /* Can we merge p's cluster with q's cluster?  We can do this only
+                                        * if both p and q are considered _core_ points of their respective
+                                        * clusters.
+                                        */
+                                        if (is_in_core[q])
+                                        {
+                                                UF_union(uf, p, q);
+                                        }
+                               }
+                               else
+                               {
+                                       UF_union(uf, p, q);
+                                       in_a_cluster[q] = LW_TRUE;
+                               }
                        }
                }
 
                lwfree(neighbors);
-
-               if (!success)
-                       break;
        }
 
+       lwfree(is_in_core);
+
+       /* Either pass in_a_cluster to our caller, or free it. */
+       if (in_a_cluster_ret)
+               *in_a_cluster_ret = in_a_cluster;
+       else
+               lwfree(in_a_cluster);
+
        if (cxt.items_found)
                lwfree(cxt.items_found);
 
@@ -337,13 +364,16 @@ union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint
        return success;
 }
 
+/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
+ *  GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
+ *  responsible for freeing the input array, but not the LWGEOM* items inside it. */
 int
-cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
+cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
 {
        int cluster_success;
        UNIONFIND* uf = UF_create(num_geoms);
 
-       if (union_dbscan(geoms, num_geoms, uf, eps, min_points) == LW_FAILURE)
+       if (union_dbscan(geoms, num_geoms, uf, tolerance, 1, NULL) == LW_FAILURE)
        {
                UF_destroy(uf);
                return LW_FAILURE;
index 4f28c4f783d41acc2627965c6c7fcc9954feae5d..2657ff9926d291d1c61f3cec30dc0a3cdfb8b78f 100644 (file)
@@ -142,25 +142,25 @@ UF_ordered_by_cluster(UNIONFIND* uf)
 }
 
 uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster)
 {
        uint32_t* ordered_components = UF_ordered_by_cluster(uf);
        uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
        uint32_t last_old_id, current_new_id, i;
+       char encountered_cluster = LW_FALSE;
 
-       last_old_id = UF_find(uf, ordered_components[0]);
-       current_new_id = 1;
+       current_new_id = 0;
        for (i = 0; i < uf->N; i++)
        {
                uint32_t j = ordered_components[i];
-
-               if (UF_size(uf, j) < min_cluster_size)
-               {
-                       new_ids[j] = noise_cluster_id;
-               }
-               else
+               if (!is_in_cluster || is_in_cluster[j])
                {
                        uint32_t current_old_id = UF_find(uf, j);
+                       if (!encountered_cluster)
+                       {
+                               encountered_cluster = LW_TRUE;
+                               last_old_id = current_old_id;
+                       }
 
                        if (current_old_id != last_old_id)
                                current_new_id++;
index ed36925c790561ea07e22ab384f187f8eb8ae2b2..4c2d3e9cb91d0b0e444148e9318a1399765deb20 100644 (file)
@@ -56,9 +56,9 @@ void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j);
 uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
 
 /* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. 
- * If min_cluster_size is greater than 1, any clusters with fewer than
- * min_cluster_size items will be assigned to noise_cluster_id.
+ * If is_in_cluster array is provided, it will be used to skip any indexes
+ * that are not in a cluster.
  * */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster);
 
 #endif
index 8018cc926babdfba9239acef27c5003405924520..8ace1e6d7d43a3eebd6d75fff11a800ec2b9a835 100644 (file)
@@ -3191,7 +3191,6 @@ Datum cluster_within_distance_garray(PG_FUNCTION_ARGS)
        LWGEOM** lw_inputs;
        LWGEOM** lw_results;
        double tolerance;
-       int min_points;
        int srid=SRID_UNKNOWN;
 
        /* Parameters used to construct a result array */
@@ -3212,17 +3211,6 @@ Datum cluster_within_distance_garray(PG_FUNCTION_ARGS)
                PG_RETURN_NULL();
        }
 
-       if (PG_NARGS() < 3)
-               min_points = 1;
-       else
-               min_points = PG_GETARG_INT32(2);
-
-       if (min_points < 1)
-       {
-               lwpgerror("min_points must be a positive number.");
-               PG_RETURN_NULL();
-       }
-
        nelems = array_nelems_not_null(array);
 
        POSTGIS_DEBUGF(3, "cluster_within_distance_garray: number of non-null elements: %d", nelems);
@@ -3240,7 +3228,7 @@ Datum cluster_within_distance_garray(PG_FUNCTION_ARGS)
                PG_RETURN_NULL();
        }
 
-       if (cluster_dbscan(lw_inputs, nelems, tolerance, min_points, &lw_results, &nclusters) != LW_SUCCESS)
+       if (cluster_within_distance(lw_inputs, nelems, tolerance, &lw_results, &nclusters) != LW_SUCCESS)
        {
                elog(ERROR, "cluster_within: Error performing clustering");
                PG_RETURN_NULL();
index 65d78924210704c0ee0d049cd140109d3aa82645..09920871b20610bbcf272391d687cef72ecde388 100644 (file)
@@ -92,6 +92,7 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
                uint32_t i;
                uint32_t* result_ids;
                LWGEOM** geoms;
+               char* is_in_cluster;
                UNIONFIND* uf;
                bool tolerance_is_null;
                bool minpoints_is_null;
@@ -127,7 +128,7 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
                        }
                }
 
-               if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints) == LW_SUCCESS)
+               if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints, &is_in_cluster) == LW_SUCCESS)
                        context->is_error = LW_FALSE;
 
                for (i = 0; i < ngeoms; i++)
@@ -139,26 +140,21 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
                if (context->is_error)
                {
                        UF_destroy(uf);
+                       lwfree(is_in_cluster);
                        lwpgerror("Error during clustering");
                        PG_RETURN_NULL();
                }
 
-               result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
+               result_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
                for (i = 0; i < ngeoms; i++)
                {
-                       if (result_ids[i] == 0)
+                       if (!is_in_cluster[i])
                        {
                                context->cluster_assignments[i].is_null = LW_TRUE;
                        }
                        else
                        {
-                               /* We overloaded the zero cluster ID above to tag "noise" geometries
-                                * that are not part of any cluster.  Now that those have been
-                                * properly converted into NULLs, we need to subtract one from 
-                                * the collapsed cluster IDs so that we get a zero-based sequence, 
-                                * consistent with our good friend ST_ClusterKMeans.
-                                */
-                               context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+                               context->cluster_assignments[i].cluster_id = result_ids[i];
                        }
                }
 
index 962a46c775192b8148becb2e9c224c51090ea49e..28f498f3c41ba0dd61638d24af8430f3159bd971 100644 (file)
@@ -24,6 +24,6 @@ t102|6|
 t103|1|
 t103|2|
 t103|3|
-t103|4|1
-t103|5|1
-t103|6|1
+t103|4|0
+t103|5|0
+t103|6|0