#3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster

author Daniel Baston <dbaston@gmail.com>

Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)

committer Daniel Baston <dbaston@gmail.com>

Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
author Daniel Baston <dbaston@gmail.com>
Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
committer Daniel Baston <dbaston@gmail.com>
Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
diff --git a/doc/reference_measure.xml b/doc/reference_measure.xml

index bd33c9d608cace47c6ffe3fa92e8c1fad7b9269b..98dd6f591a7bc78135bd91d4a7bc880832005eaf 100644 (file)
--- a/doc/reference_measure.xml
+++ b/doc/reference_measure.xml
@@ -1082,8 +1082,9 @@ SELECT ST_AsText(
            <ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink> 
            algorithm.  An input geometry will be added to a cluster if it is 
            within <varname>eps</varname> distance of at least
-          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require to specify the number of
-          clusters for each dataset, but instead uses the desired input distance <varname>eps</varname> at optimal cluster size for each cluster.
+          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
+          clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters 
+          to construct each cluster.  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
        </para>
        <para>Availability: 2.3.0 - requires GEOS </para>
      </refsection>
diff --git a/liblwgeom/cunit/cu_unionfind.c b/liblwgeom/cunit/cu_unionfind.c

index 3491c30d9fffb3181b6c75d7fb238e0d4e482670..cb5566fa9cd86ab4cf393a7743efec055e3e1dd3 100644 (file)
--- a/liblwgeom/cunit/cu_unionfind.c
+++ b/liblwgeom/cunit/cu_unionfind.c
@@ -125,14 +125,35 @@ static void test_unionfind_collapse_cluster_ids(void)
         uf->clusters[8] = 8;
         uf->clusters[9] = 7;
  
+       uf->cluster_sizes[0] = 3;
+       uf->cluster_sizes[1] = 4;
+       uf->cluster_sizes[2] = 4;
+       uf->cluster_sizes[3] = 4;
+       uf->cluster_sizes[4] = 3;
+       uf->cluster_sizes[5] = 4;
+       uf->cluster_sizes[6] = 3;
+       uf->cluster_sizes[7] = 3;
+       uf->cluster_sizes[8] = 3;
+       uf->cluster_sizes[9] = 3;
+
         /* 5 -> 0
          * 7 -> 1
          * 8 -> 2
          */
-       uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
-       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf);
+       uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
+       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
  
         CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+
+       lwfree(collapsed_ids);
+
+       uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
+       collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+
+       CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+
+       lwfree(collapsed_ids);
+       UF_destroy(uf);
  }
  
  void unionfind_suite_setup(void);
diff --git a/liblwgeom/lwunionfind.c b/liblwgeom/lwunionfind.c

index 50bc31dbd29d09224221d4088bfc4e62df9d6a61..4f28c4f783d41acc2627965c6c7fcc9954feae5d 100644 (file)
--- a/liblwgeom/lwunionfind.c
+++ b/liblwgeom/lwunionfind.c
@@ -74,6 +74,12 @@ UF_find (UNIONFIND* uf, uint32_t i)
         return i;
  }
  
+uint32_t
+UF_size (UNIONFIND* uf, uint32_t i)
+{
+    return uf->cluster_sizes[UF_find(uf, i)];
+}
+
  void
  UF_union(UNIONFIND* uf, uint32_t i, uint32_t j)
  {
@@ -136,24 +142,32 @@ UF_ordered_by_cluster(UNIONFIND* uf)
  }
  
  uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
  {
         uint32_t* ordered_components = UF_ordered_by_cluster(uf);
         uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
         uint32_t last_old_id, current_new_id, i;
  
         last_old_id = UF_find(uf, ordered_components[0]);
-       current_new_id = 0;
+       current_new_id = 1;
         for (i = 0; i < uf->N; i++)
         {
                 uint32_t j = ordered_components[i];
-               uint32_t current_old_id = UF_find(uf, j);
  
-               if (current_old_id != last_old_id)
-                       current_new_id++;
+               if (UF_size(uf, j) < min_cluster_size)
+               {
+                       new_ids[j] = noise_cluster_id;
+               }
+               else
+               {
+                       uint32_t current_old_id = UF_find(uf, j);
+
+                       if (current_old_id != last_old_id)
+                               current_new_id++;
  
-               new_ids[j] = current_new_id;
-               last_old_id = current_old_id;
+                       new_ids[j] = current_new_id;
+                       last_old_id = current_old_id;
+               }
         }
  
         lwfree(ordered_components);
diff --git a/liblwgeom/lwunionfind.h b/liblwgeom/lwunionfind.h

index f644e2ba65bc0c9ba07bb9269825058a08e0a45f..ed36925c790561ea07e22ab384f187f8eb8ae2b2 100644 (file)
--- a/liblwgeom/lwunionfind.h
+++ b/liblwgeom/lwunionfind.h
@@ -45,14 +45,20 @@ void UF_destroy(UNIONFIND* uf);
  /* Identify the cluster id associated with specified component id */
  uint32_t UF_find(UNIONFIND* uf, uint32_t i);
  
-/* Merge the clusters that contain the two specified components ids */
+/* Get the size of the cluster associated with the specified component id */
+uint32_t UF_size(UNIONFIND* uf, uint32_t i);
+
+/* Merge the clusters that contain the two specified component ids */
  void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j);
  
  /* Return an array of component ids, where components that are in the
   * same cluster are contiguous in the array */
  uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
  
-/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf);
+/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. 
+ * If min_cluster_size is greater than 1, any clusters with fewer than
+ * min_cluster_size items will be assigned to noise_cluster_id.
+ * */
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
  
  #endif
diff --git a/postgis/lwgeom_window.c b/postgis/lwgeom_window.c

index ecdece076add3bbc686654579d045395e6e2a748..65d78924210704c0ee0d049cd140109d3aa82645 100644 (file)
--- a/postgis/lwgeom_window.c
+++ b/postgis/lwgeom_window.c
@@ -143,10 +143,23 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
                         PG_RETURN_NULL();
                 }
  
-               result_ids = UF_get_collapsed_cluster_ids(uf);
+               result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
                 for (i = 0; i < ngeoms; i++)
                 {
-                       context->cluster_assignments[i].cluster_id = result_ids[i];
+                       if (result_ids[i] == 0)
+                       {
+                               context->cluster_assignments[i].is_null = LW_TRUE;
+                       }
+                       else
+                       {
+                               /* We overloaded the zero cluster ID above to tag "noise" geometries
+                                * that are not part of any cluster.  Now that those have been
+                                * properly converted into NULLs, we need to subtract one from 
+                                * the collapsed cluster IDs so that we get a zero-based sequence, 
+                                * consistent with our good friend ST_ClusterKMeans.
+                                */
+                               context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+                       }
                 }
  
                 lwfree(result_ids);
diff --git a/regress/cluster_expected b/regress/cluster_expected

index 4f7d5578190d37051d74493286f863925bdaaa3c..962a46c775192b8148becb2e9c224c51090ea49e 100644 (file)
--- a/regress/cluster_expected
+++ b/regress/cluster_expected
@@ -15,15 +15,15 @@ t101|3|0
  t101|4|1
  t101|5|1
  t101|6|1
-t102|1|0
-t102|2|1
-t102|3|2
-t102|4|3
-t102|5|4
-t102|6|5
-t103|1|0
-t103|2|1
-t103|3|2
-t103|4|3
-t103|5|3
-t103|6|3
+t102|1|
+t102|2|
+t102|3|
+t102|4|
+t102|5|
+t102|6|
+t103|1|
+t103|2|
+t103|3|
+t103|4|1
+t103|5|1
+t103|6|1
author	Daniel Baston <dbaston@gmail.com>
	Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
committer	Daniel Baston <dbaston@gmail.com>
	Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
doc/reference_measure.xml		patch \| blob \| history
liblwgeom/cunit/cu_unionfind.c		patch \| blob \| history
liblwgeom/lwunionfind.c		patch \| blob \| history
liblwgeom/lwunionfind.h		patch \| blob \| history
postgis/lwgeom_window.c		patch \| blob \| history
regress/cluster_expected		patch \| blob \| history