From cc21ff86777fe3a7efa5b4be34c78c95dc3ef9fe Mon Sep 17 00:00:00 2001 From: Daniel Baston Date: Thu, 2 Jun 2016 01:24:52 +0000 Subject: [PATCH] #3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster git-svn-id: http://svn.osgeo.org/postgis/trunk@14928 b70326c6-7e19-0410-871a-916f4a2858ee --- doc/reference_measure.xml | 5 +++-- liblwgeom/cunit/cu_unionfind.c | 25 +++++++++++++++++++++++-- liblwgeom/lwunionfind.c | 28 +++++++++++++++++++++------- liblwgeom/lwunionfind.h | 12 +++++++++--- postgis/lwgeom_window.c | 17 +++++++++++++++-- regress/cluster_expected | 24 ++++++++++++------------ 6 files changed, 83 insertions(+), 28 deletions(-) diff --git a/doc/reference_measure.xml b/doc/reference_measure.xml index bd33c9d60..98dd6f591 100644 --- a/doc/reference_measure.xml +++ b/doc/reference_measure.xml @@ -1082,8 +1082,9 @@ SELECT ST_AsText( Density-based spatial clustering of applications with noise (DBSCAN) algorithm. An input geometry will be added to a cluster if it is within eps distance of at least - minpoints other input geometries. Unlike , it does not require to specify the number of - clusters for each dataset, but instead uses the desired input distance eps at optimal cluster size for each cluster. + minpoints other input geometries. Unlike , it does not require the number of + clusters to be specified, but instead uses the desired distance (eps) and density(minpoints) parameters + to construct each cluster. Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL. Availability: 2.3.0 - requires GEOS diff --git a/liblwgeom/cunit/cu_unionfind.c b/liblwgeom/cunit/cu_unionfind.c index 3491c30d9..cb5566fa9 100644 --- a/liblwgeom/cunit/cu_unionfind.c +++ b/liblwgeom/cunit/cu_unionfind.c @@ -125,14 +125,35 @@ static void test_unionfind_collapse_cluster_ids(void) uf->clusters[8] = 8; uf->clusters[9] = 7; + uf->cluster_sizes[0] = 3; + uf->cluster_sizes[1] = 4; + uf->cluster_sizes[2] = 4; + uf->cluster_sizes[3] = 4; + uf->cluster_sizes[4] = 3; + uf->cluster_sizes[5] = 4; + uf->cluster_sizes[6] = 3; + uf->cluster_sizes[7] = 3; + uf->cluster_sizes[8] = 3; + uf->cluster_sizes[9] = 3; + /* 5 -> 0 * 7 -> 1 * 8 -> 2 */ - uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 }; - uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf); + uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 }; + uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0); CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t))); + + lwfree(collapsed_ids); + + uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 }; + collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999); + + CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t))); + + lwfree(collapsed_ids); + UF_destroy(uf); } void unionfind_suite_setup(void); diff --git a/liblwgeom/lwunionfind.c b/liblwgeom/lwunionfind.c index 50bc31dbd..4f28c4f78 100644 --- a/liblwgeom/lwunionfind.c +++ b/liblwgeom/lwunionfind.c @@ -74,6 +74,12 @@ UF_find (UNIONFIND* uf, uint32_t i) return i; } +uint32_t +UF_size (UNIONFIND* uf, uint32_t i) +{ + return uf->cluster_sizes[UF_find(uf, i)]; +} + void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j) { @@ -136,24 +142,32 @@ UF_ordered_by_cluster(UNIONFIND* uf) } uint32_t* -UF_get_collapsed_cluster_ids(UNIONFIND* uf) +UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id) { uint32_t* ordered_components = UF_ordered_by_cluster(uf); uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t)); uint32_t last_old_id, current_new_id, i; last_old_id = UF_find(uf, ordered_components[0]); - current_new_id = 0; + current_new_id = 1; for (i = 0; i < uf->N; i++) { uint32_t j = ordered_components[i]; - uint32_t current_old_id = UF_find(uf, j); - if (current_old_id != last_old_id) - current_new_id++; + if (UF_size(uf, j) < min_cluster_size) + { + new_ids[j] = noise_cluster_id; + } + else + { + uint32_t current_old_id = UF_find(uf, j); + + if (current_old_id != last_old_id) + current_new_id++; - new_ids[j] = current_new_id; - last_old_id = current_old_id; + new_ids[j] = current_new_id; + last_old_id = current_old_id; + } } lwfree(ordered_components); diff --git a/liblwgeom/lwunionfind.h b/liblwgeom/lwunionfind.h index f644e2ba6..ed36925c7 100644 --- a/liblwgeom/lwunionfind.h +++ b/liblwgeom/lwunionfind.h @@ -45,14 +45,20 @@ void UF_destroy(UNIONFIND* uf); /* Identify the cluster id associated with specified component id */ uint32_t UF_find(UNIONFIND* uf, uint32_t i); -/* Merge the clusters that contain the two specified components ids */ +/* Get the size of the cluster associated with the specified component id */ +uint32_t UF_size(UNIONFIND* uf, uint32_t i); + +/* Merge the clusters that contain the two specified component ids */ void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j); /* Return an array of component ids, where components that are in the * same cluster are contiguous in the array */ uint32_t* UF_ordered_by_cluster(UNIONFIND* uf); -/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */ -uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf); +/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. + * If min_cluster_size is greater than 1, any clusters with fewer than + * min_cluster_size items will be assigned to noise_cluster_id. + * */ +uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id); #endif diff --git a/postgis/lwgeom_window.c b/postgis/lwgeom_window.c index ecdece076..65d789242 100644 --- a/postgis/lwgeom_window.c +++ b/postgis/lwgeom_window.c @@ -143,10 +143,23 @@ Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result_ids = UF_get_collapsed_cluster_ids(uf); + result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0); for (i = 0; i < ngeoms; i++) { - context->cluster_assignments[i].cluster_id = result_ids[i]; + if (result_ids[i] == 0) + { + context->cluster_assignments[i].is_null = LW_TRUE; + } + else + { + /* We overloaded the zero cluster ID above to tag "noise" geometries + * that are not part of any cluster. Now that those have been + * properly converted into NULLs, we need to subtract one from + * the collapsed cluster IDs so that we get a zero-based sequence, + * consistent with our good friend ST_ClusterKMeans. + */ + context->cluster_assignments[i].cluster_id = result_ids[i] - 1; + } } lwfree(result_ids); diff --git a/regress/cluster_expected b/regress/cluster_expected index 4f7d55781..962a46c77 100644 --- a/regress/cluster_expected +++ b/regress/cluster_expected @@ -15,15 +15,15 @@ t101|3|0 t101|4|1 t101|5|1 t101|6|1 -t102|1|0 -t102|2|1 -t102|3|2 -t102|4|3 -t102|5|4 -t102|6|5 -t103|1|0 -t103|2|1 -t103|3|2 -t103|4|3 -t103|5|3 -t103|6|3 +t102|1| +t102|2| +t102|3| +t102|4| +t102|5| +t102|6| +t103|1| +t103|2| +t103|3| +t103|4|1 +t103|5|1 +t103|6|1 -- 2.50.1