]> granicus.if.org Git - postgis/commitdiff
#3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster
authorDaniel Baston <dbaston@gmail.com>
Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
committerDaniel Baston <dbaston@gmail.com>
Thu, 2 Jun 2016 01:24:52 +0000 (01:24 +0000)
git-svn-id: http://svn.osgeo.org/postgis/trunk@14928 b70326c6-7e19-0410-871a-916f4a2858ee

doc/reference_measure.xml
liblwgeom/cunit/cu_unionfind.c
liblwgeom/lwunionfind.c
liblwgeom/lwunionfind.h
postgis/lwgeom_window.c
regress/cluster_expected

index bd33c9d608cace47c6ffe3fa92e8c1fad7b9269b..98dd6f591a7bc78135bd91d4a7bc880832005eaf 100644 (file)
@@ -1082,8 +1082,9 @@ SELECT ST_AsText(
           <ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink> 
           algorithm.  An input geometry will be added to a cluster if it is 
           within <varname>eps</varname> distance of at least
-          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require to specify the number of
-          clusters for each dataset, but instead uses the desired input distance <varname>eps</varname> at optimal cluster size for each cluster.
+          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
+          clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters 
+          to construct each cluster.  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
       </para>
       <para>Availability: 2.3.0 - requires GEOS </para>
     </refsection>
index 3491c30d9fffb3181b6c75d7fb238e0d4e482670..cb5566fa9cd86ab4cf393a7743efec055e3e1dd3 100644 (file)
@@ -125,14 +125,35 @@ static void test_unionfind_collapse_cluster_ids(void)
        uf->clusters[8] = 8;
        uf->clusters[9] = 7;
 
+       uf->cluster_sizes[0] = 3;
+       uf->cluster_sizes[1] = 4;
+       uf->cluster_sizes[2] = 4;
+       uf->cluster_sizes[3] = 4;
+       uf->cluster_sizes[4] = 3;
+       uf->cluster_sizes[5] = 4;
+       uf->cluster_sizes[6] = 3;
+       uf->cluster_sizes[7] = 3;
+       uf->cluster_sizes[8] = 3;
+       uf->cluster_sizes[9] = 3;
+
        /* 5 -> 0
         * 7 -> 1
         * 8 -> 2
         */
-       uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
-       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf);
+       uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
+       uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
 
        CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+
+       lwfree(collapsed_ids);
+
+       uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
+       collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+
+       CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+
+       lwfree(collapsed_ids);
+       UF_destroy(uf);
 }
 
 void unionfind_suite_setup(void);
index 50bc31dbd29d09224221d4088bfc4e62df9d6a61..4f28c4f783d41acc2627965c6c7fcc9954feae5d 100644 (file)
@@ -74,6 +74,12 @@ UF_find (UNIONFIND* uf, uint32_t i)
        return i;
 }
 
+uint32_t
+UF_size (UNIONFIND* uf, uint32_t i)
+{
+    return uf->cluster_sizes[UF_find(uf, i)];
+}
+
 void
 UF_union(UNIONFIND* uf, uint32_t i, uint32_t j)
 {
@@ -136,24 +142,32 @@ UF_ordered_by_cluster(UNIONFIND* uf)
 }
 
 uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
 {
        uint32_t* ordered_components = UF_ordered_by_cluster(uf);
        uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
        uint32_t last_old_id, current_new_id, i;
 
        last_old_id = UF_find(uf, ordered_components[0]);
-       current_new_id = 0;
+       current_new_id = 1;
        for (i = 0; i < uf->N; i++)
        {
                uint32_t j = ordered_components[i];
-               uint32_t current_old_id = UF_find(uf, j);
 
-               if (current_old_id != last_old_id)
-                       current_new_id++;
+               if (UF_size(uf, j) < min_cluster_size)
+               {
+                       new_ids[j] = noise_cluster_id;
+               }
+               else
+               {
+                       uint32_t current_old_id = UF_find(uf, j);
+
+                       if (current_old_id != last_old_id)
+                               current_new_id++;
 
-               new_ids[j] = current_new_id;
-               last_old_id = current_old_id;
+                       new_ids[j] = current_new_id;
+                       last_old_id = current_old_id;
+               }
        }
 
        lwfree(ordered_components);
index f644e2ba65bc0c9ba07bb9269825058a08e0a45f..ed36925c790561ea07e22ab384f187f8eb8ae2b2 100644 (file)
@@ -45,14 +45,20 @@ void UF_destroy(UNIONFIND* uf);
 /* Identify the cluster id associated with specified component id */
 uint32_t UF_find(UNIONFIND* uf, uint32_t i);
 
-/* Merge the clusters that contain the two specified components ids */
+/* Get the size of the cluster associated with the specified component id */
+uint32_t UF_size(UNIONFIND* uf, uint32_t i);
+
+/* Merge the clusters that contain the two specified component ids */
 void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j);
 
 /* Return an array of component ids, where components that are in the
  * same cluster are contiguous in the array */
 uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
 
-/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf);
+/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. 
+ * If min_cluster_size is greater than 1, any clusters with fewer than
+ * min_cluster_size items will be assigned to noise_cluster_id.
+ * */
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
 
 #endif
index ecdece076add3bbc686654579d045395e6e2a748..65d78924210704c0ee0d049cd140109d3aa82645 100644 (file)
@@ -143,10 +143,23 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
                        PG_RETURN_NULL();
                }
 
-               result_ids = UF_get_collapsed_cluster_ids(uf);
+               result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
                for (i = 0; i < ngeoms; i++)
                {
-                       context->cluster_assignments[i].cluster_id = result_ids[i];
+                       if (result_ids[i] == 0)
+                       {
+                               context->cluster_assignments[i].is_null = LW_TRUE;
+                       }
+                       else
+                       {
+                               /* We overloaded the zero cluster ID above to tag "noise" geometries
+                                * that are not part of any cluster.  Now that those have been
+                                * properly converted into NULLs, we need to subtract one from 
+                                * the collapsed cluster IDs so that we get a zero-based sequence, 
+                                * consistent with our good friend ST_ClusterKMeans.
+                                */
+                               context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+                       }
                }
 
                lwfree(result_ids);
index 4f7d5578190d37051d74493286f863925bdaaa3c..962a46c775192b8148becb2e9c224c51090ea49e 100644 (file)
@@ -15,15 +15,15 @@ t101|3|0
 t101|4|1
 t101|5|1
 t101|6|1
-t102|1|0
-t102|2|1
-t102|3|2
-t102|4|3
-t102|5|4
-t102|6|5
-t103|1|0
-t103|2|1
-t103|3|2
-t103|4|3
-t103|5|3
-t103|6|3
+t102|1|
+t102|2|
+t102|3|
+t102|4|
+t102|5|
+t102|6|
+t103|1|
+t103|2|
+t103|3|
+t103|4|1
+t103|5|1
+t103|6|1