use hdbscan::{Center, DistanceMetric, Hdbscan, HdbscanError, HdbscanHyperParams, NnAlgorithm};
use num_traits::Float;
use std::collections::HashSet;
type ClusterFn = fn(&Hdbscan<f32>) -> Result<Vec<i32>, HdbscanError>;
pub(crate) fn test_cluster(cluster_fn: ClusterFn) {
let data = cluster_test_data();
let clusterer = Hdbscan::default_hyper_params(&data);
let result = cluster_fn(&clusterer).unwrap();
assert_eq!(1, result[..5].iter().collect::<HashSet<_>>().len());
assert_eq!(1, result[5..10].iter().collect::<HashSet<_>>().len());
assert_eq!(-1, result[10]);
}
pub(crate) fn test_builder_cluster(cluster_fn: ClusterFn) {
let data = vec![
vec![1.3, 1.1],
vec![1.3, 1.2],
vec![1.2, 1.2],
vec![1.0, 1.1],
vec![0.9, 1.0],
vec![0.9, 1.0],
vec![3.7, 4.0],
];
let hyper_params = HdbscanHyperParams::builder()
.min_cluster_size(3)
.min_samples(2)
.dist_metric(DistanceMetric::Manhattan)
.nn_algorithm(NnAlgorithm::BruteForce)
.build();
let clusterer = Hdbscan::new(&data, hyper_params);
let result = cluster_fn(&clusterer).unwrap();
assert_eq!(1, result[..3].iter().collect::<HashSet<_>>().len());
assert_eq!(1, result[3..6].iter().collect::<HashSet<_>>().len());
assert_eq!(-1, result[6]);
}
pub(crate) fn test_single_cluster(cluster_fn: ClusterFn) {
let data = vec![
vec![1.1, 1.1],
vec![1.2, 1.1],
vec![1.3, 1.2],
vec![1.1, 1.3],
vec![1.2, 1.2],
vec![3.0, 3.0],
];
let hp = HdbscanHyperParams::builder()
.nn_algorithm(NnAlgorithm::BruteForce)
.allow_single_cluster(true)
.min_cluster_size(4)
.min_samples(4)
.build();
let clusterer = Hdbscan::new(&data, hp);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters: HashSet<_> = result.iter().filter(|&&x| x != -1).collect();
assert_eq!(1, unique_clusters.len());
let noise_points: Vec<_> = result.iter().filter(|&&x| x == -1).collect();
assert_eq!(1, noise_points.len());
}
pub(crate) fn test_single_cluster_epsilon_search(cluster_fn: ClusterFn) {
let data = vec![
vec![1.1, 1.1],
vec![1.2, 1.1],
vec![1.3, 1.2],
vec![2.1, 1.3],
vec![2.2, 1.2],
vec![2.0, 1.2],
vec![3.0, 3.0],
];
let hp = HdbscanHyperParams::builder().min_cluster_size(3).build();
let clusterer = Hdbscan::new(&data, hp);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters = result
.iter()
.filter(|&&label| label != -1)
.collect::<HashSet<_>>();
assert_eq!(2, unique_clusters.len());
let n_noise = result.iter().filter(|&&label| label == -1).count();
assert_eq!(1, n_noise);
let hp = HdbscanHyperParams::builder()
.allow_single_cluster(true)
.min_cluster_size(3)
.epsilon(1.2)
.build();
let clusterer = Hdbscan::new(&data, hp);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters = result
.iter()
.filter(|&&label| label != -1)
.collect::<HashSet<_>>();
assert_eq!(1, unique_clusters.len());
let n_noise = result.iter().filter(|&&label| label == -1).count();
assert_eq!(1, n_noise);
}
pub(crate) fn test_single_root_cluster_only_epsilon_search(cluster_fn: ClusterFn) {
let data = vec![
vec![1.1, 1.1],
vec![1.2, 1.1],
vec![1.3, 1.2],
vec![3.0, 3.0],
];
let hp = HdbscanHyperParams::builder()
.allow_single_cluster(true)
.min_cluster_size(3)
.epsilon(1.2)
.build();
let clusterer = Hdbscan::new(&data, hp);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters = result
.iter()
.filter(|&&label| label != -1)
.collect::<HashSet<_>>();
assert_eq!(1, unique_clusters.len());
let n_noise = result.iter().filter(|&&label| label == -1).count();
assert_eq!(1, n_noise);
}
pub(crate) fn test_empty_data(cluster_fn: ClusterFn) {
let data: Vec<Vec<f32>> = Vec::new();
let clusterer = Hdbscan::default_hyper_params(&data);
let result = cluster_fn(&clusterer);
assert!(matches!(result, Err(HdbscanError::EmptyDataset)));
}
pub(crate) fn test_non_finite_coordinate(cluster_fn: ClusterFn) {
let data = vec![vec![1.5, f32::infinity()]];
let clusterer = Hdbscan::default_hyper_params(&data);
let result = cluster_fn(&clusterer);
assert!(matches!(result, Err(HdbscanError::NonFiniteCoordinate(..))));
}
pub(crate) fn test_mismatched_dimensions(cluster_fn: ClusterFn) {
let data = vec![vec![1.5, 2.2], vec![1.0, 1.1], vec![1.2]];
let clusterer = Hdbscan::default_hyper_params(&data);
let result = cluster_fn(&clusterer);
assert!(matches!(result, Err(HdbscanError::WrongDimension(..))));
}
pub(crate) fn test_calc_centroids(cluster_fn: ClusterFn) {
let data = cluster_test_data();
let clusterer = Hdbscan::default_hyper_params(&data);
let labels = cluster_fn(&clusterer).unwrap();
let centroids = clusterer.calc_centers(Center::Centroid, &labels).unwrap();
assert_eq!(2, centroids.len());
assert!(centroids.contains(&vec![3.8, 4.0]) && centroids.contains(&vec![1.12, 1.34]));
}
pub(crate) fn test_calc_medoids(cluster_fn: ClusterFn) {
let data: Vec<Vec<f32>> = vec![
vec![1.3, 1.2],
vec![1.2, 1.3],
vec![1.5, 1.5],
vec![1.6, 1.7],
vec![1.7, 1.6],
vec![6.3, 6.2],
vec![6.2, 6.3],
vec![6.5, 6.5],
vec![6.6, 6.7],
vec![6.7, 6.6],
];
let clusterer = Hdbscan::default_hyper_params(&data);
let result = cluster_fn(&clusterer).unwrap();
let centers = clusterer.calc_centers(Center::Medoid, &result).unwrap();
let unique_clusters = result
.iter()
.filter(|&&label| label != -1)
.collect::<HashSet<_>>();
assert_eq!(centers.len(), unique_clusters.len());
centers
.iter()
.for_each(|center| assert!(data.contains(center)));
assert_eq!(vec![1.5, 1.5], centers[0]);
assert_eq!(vec![6.5, 6.5], centers[1]);
}
pub(crate) fn cluster_test_data() -> Vec<Vec<f32>> {
vec![
vec![1.5, 2.2],
vec![1.0, 1.1],
vec![1.2, 1.4],
vec![0.8, 1.0],
vec![1.1, 1.0],
vec![3.7, 4.0],
vec![3.9, 3.9],
vec![3.6, 4.1],
vec![3.8, 3.9],
vec![4.0, 4.1],
vec![10.0, 10.0],
]
}
pub(crate) fn test_nyc_landmarks_haversine(cluster_fn: ClusterFn) {
let data = vec![
vec![40.6892, -74.0445], vec![40.7036, -74.0141], vec![40.7033, -74.0170], vec![40.7812, -73.9665], vec![40.7794, -73.9632], vec![40.7729, -73.9734], vec![40.7580, -73.9855], vec![40.7614, -73.9776], vec![40.7505, -73.9934], vec![40.6413, -74.0781], ];
let hyper_params = HdbscanHyperParams::builder()
.min_cluster_size(2)
.min_samples(1)
.dist_metric(DistanceMetric::Haversine)
.epsilon(500.0)
.nn_algorithm(NnAlgorithm::BruteForce)
.build();
let clusterer = Hdbscan::new(&data, hyper_params);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters: HashSet<_> = result.iter().filter(|&&x| x != -1).collect();
assert_eq!(unique_clusters.len(), 3, "Should have 3 distinct clusters");
assert_eq!(
result.iter().filter(|&&x| x == -1).count(),
1,
"Should have 1 noise point"
);
assert_eq!(result[0], result[1]);
assert_eq!(result[1], result[2]);
assert_eq!(result[3], result[4]);
assert_eq!(result[4], result[5]);
assert_eq!(result[6], result[7]);
assert_eq!(result[7], result[8]);
assert_eq!(result[9], -1);
}
pub(crate) fn test_geo_cluster_across_180th_meridian(cluster_fn: ClusterFn) {
let data = vec![
vec![-16.8410, 179.9813], vec![-16.7480, -179.9670], vec![51.5085, -0.1257], ];
let hyper_params = HdbscanHyperParams::builder()
.dist_metric(DistanceMetric::Haversine)
.allow_single_cluster(true)
.min_cluster_size(2)
.min_samples(1)
.build();
let clusterer = Hdbscan::new(&data, hyper_params);
let labels = cluster_fn(&clusterer).unwrap();
assert_eq!(
1,
labels
.iter()
.filter(|&&x| x != -1)
.collect::<HashSet<_>>()
.len()
);
assert_eq!(-1, labels[2]);
let centroids = clusterer
.calc_centers(Center::GeoCentroid, &labels)
.unwrap();
let cluster_longitude = centroids[0][1];
assert!(cluster_longitude > 179.0 || cluster_longitude < -179.0);
}
pub(crate) fn test_cylindrical_hsv_colours(cluster_fn: ClusterFn) {
let data = vec![
vec![0.91, 3.80482, 0.62],
vec![0.96, 4.13643, 0.86],
vec![0.95, 3.56047, 0.85],
vec![0.74, 1.91986, 0.39],
vec![0.90, 1.69297, 0.82],
vec![0.84, 2.14675, 0.72],
vec![0.60, 6.2657, 0.00],
];
let hyper_params = HdbscanHyperParams::builder()
.dist_metric(DistanceMetric::Cylindrical)
.nn_algorithm(NnAlgorithm::BruteForce)
.min_cluster_size(3)
.min_samples(1)
.build();
let clusterer = Hdbscan::new(&data, hyper_params);
let result = cluster_fn(&clusterer).unwrap();
assert_eq!(1, result[..3].iter().collect::<HashSet<_>>().len());
assert_eq!(1, result[3..6].iter().collect::<HashSet<_>>().len());
assert_eq!(-1, result[6]);
}
pub(crate) fn test_precomputed_distances(cluster_fn: ClusterFn) {
let dist_matrix = vec![
vec![0.0, 0.1, 0.2, 0.3, 9.0],
vec![0.1, 0.0, 0.1, 0.2, 9.0],
vec![0.2, 0.1, 0.0, 0.1, 9.0],
vec![0.3, 0.2, 0.1, 0.0, 9.0],
vec![9.0, 9.0, 9.0, 9.0, 9.0],
];
let hyper_params = HdbscanHyperParams::builder()
.dist_metric(DistanceMetric::Precalculated)
.allow_single_cluster(true)
.min_cluster_size(2)
.min_samples(1)
.build();
let clusterer = Hdbscan::new(&dist_matrix, hyper_params);
let result = cluster_fn(&clusterer).unwrap();
let unique_clusters: HashSet<_> = result.iter().filter(|&&x| x != -1).collect();
assert_eq!(unique_clusters.len(), 1, "Should have 1 distinct cluster");
assert_eq!(result[result.len() - 1], -1, "Should have 0 noise points");
}