use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::cmp::Ordering;
use std::fmt::Debug;
use super::distance;
use super::DistanceMetric;
use crate::error::{ClusteringError, Result};
/// Output of an HDBSCAN run.
#[derive(Debug, Clone)]
pub struct HDBSCANResult<F: Float> {
    /// Cluster label per sample; `-1` marks noise points.
    pub labels: Array1<i32>,
    /// Per-sample membership strength, normalised to `[0, 1]`.
    pub probabilities: Array1<F>,
    /// Condensed cluster tree, when retained.
    pub condensed_tree: Option<CondensedTree<F>>,
    /// Single-linkage dendrogram, when retained (needed by `dbscan_clustering`).
    pub single_linkage_tree: Option<SingleLinkageTree<F>>,
    /// Per-cluster centroids, when requested via `StoreCenter`.
    pub centroids: Option<Array2<F>>,
    /// Per-cluster medoid sample indices, when requested via `StoreCenter`.
    pub medoids: Option<Array1<usize>>,
}
/// Single-linkage dendrogram in parallel-array form; entry `i` describes the
/// `i`-th merge. Child ids below the sample count are original points;
/// larger ids refer to earlier merges.
#[derive(Debug, Clone)]
pub struct SingleLinkageTree<F: Float> {
    /// Left child id of each merge.
    pub left_child: Vec<i32>,
    /// Right child id of each merge.
    pub right_child: Vec<i32>,
    /// Distance at which each merge happened.
    pub distances: Vec<F>,
    /// Number of points in the cluster produced by each merge.
    pub sizes: Vec<usize>,
}
/// Condensed cluster tree in parallel-array form. Each entry is an edge from
/// `parent[i]` to `child[i]` appearing at density `lambda_val[i]`
/// (lambda = 1 / distance). Negative child ids encode individual sample
/// points as `-(point + 1)`.
#[derive(Debug, Clone)]
pub struct CondensedTree<F: Float> {
    /// Parent cluster id of each edge.
    pub parent: Vec<i32>,
    /// Child id (cluster, or encoded sample point when negative).
    pub child: Vec<i32>,
    /// Lambda (inverse distance) at which the child splits off.
    pub lambda_val: Vec<F>,
    /// Size of the child cluster (1 for individual points).
    pub sizes: Vec<usize>,
}
/// Edge of the minimum spanning tree built over the mutual-reachability graph.
#[derive(Debug, Clone, PartialEq)]
#[allow(dead_code)]
struct MSTElement<F: Float> {
    point1: usize,
    point2: usize,
    distance: F,
}

// NOTE(review): `Eq` is asserted although `F` may be a NaN-capable float type;
// this assumes distances are never NaN — TODO confirm upstream.
impl<F: Float> Eq for MSTElement<F> {}

impl<F: Float> PartialOrd for MSTElement<F> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Delegate to `cmp` so the two orderings can never disagree.
        Some(self.cmp(other))
    }
}

impl<F: Float> Ord for MSTElement<F> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Deliberately reversed (`other` vs `self`) so a max-heap such as
        // `BinaryHeap` pops the *smallest* distance first; incomparable
        // (NaN) distances compare as equal.
        other
            .distance
            .partial_cmp(&self.distance)
            .unwrap_or(Ordering::Equal)
    }
}
/// Tuning parameters for [`hdbscan`].
#[derive(Debug, Clone)]
pub struct HDBSCANOptions<F: Float> {
    /// Minimum number of points for a group to be considered a cluster.
    pub min_cluster_size: usize,
    /// Neighbourhood size for core distances; defaults to `min_cluster_size`
    /// when `None`.
    pub minsamples: Option<usize>,
    /// Distance threshold for cluster-selection merging (0 disables it).
    pub cluster_selection_epsilon: F,
    /// Optional upper bound on cluster size.
    pub max_cluster_size: Option<usize>,
    /// Flat-cluster extraction strategy (EOM or Leaf).
    pub cluster_selection_method: ClusterSelectionMethod,
    /// Permit returning a single cluster covering (nearly) all points.
    pub allow_single_cluster: bool,
    /// Which cluster centers to compute, if any.
    pub store_centers: Option<StoreCenter>,
    /// Distance metric for core/mutual-reachability distances.
    pub metric: DistanceMetric,
    /// Distance scaling factor (currently unused by this implementation —
    /// NOTE(review): confirm whether `alpha` should weight the metric).
    pub alpha: F,
}
/// Strategy for extracting flat clusters from the condensed tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterSelectionMethod {
    /// Excess of Mass: choose the most stable clusters across the hierarchy.
    EOM,
    /// Take every leaf of the condensed tree as a cluster.
    Leaf,
}
/// Which cluster-center representation(s) to compute and store in the result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StoreCenter {
    /// Arithmetic mean of each cluster's members.
    Centroid,
    /// The member minimising total distance to all other members.
    Medoid,
    /// Both centroids and medoids.
    Both,
}
impl<F: Float + FromPrimitive> Default for HDBSCANOptions<F> {
    /// Defaults mirror common HDBSCAN library settings:
    /// `min_cluster_size = 5`, EOM selection, Euclidean metric, no stored
    /// centers, single-cluster results disallowed.
    fn default() -> Self {
        Self {
            min_cluster_size: 5,
            minsamples: None,
            cluster_selection_epsilon: F::zero(),
            max_cluster_size: None,
            cluster_selection_method: ClusterSelectionMethod::EOM,
            allow_single_cluster: false,
            store_centers: None,
            metric: DistanceMetric::Euclidean,
            alpha: F::one(),
        }
    }
}
/// Runs HDBSCAN (Hierarchical Density-Based Spatial Clustering of
/// Applications with Noise) on `data`, where rows are samples.
///
/// Pipeline: core distances -> mutual-reachability matrix -> minimum
/// spanning tree -> single-linkage tree -> condensed tree -> flat cluster
/// extraction -> optional cluster centers.
///
/// # Errors
/// Returns `InvalidInput` for empty data, or when `minsamples` /
/// `min_cluster_size` is below 2.
#[allow(dead_code)]
pub fn hdbscan<F>(
    data: ArrayView2<F>,
    options: Option<HDBSCANOptions<F>>,
) -> Result<HDBSCANResult<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = data.shape()[0];
    if nsamples == 0 {
        return Err(ClusteringError::InvalidInput("Empty input data".into()));
    }
    let opts = options.unwrap_or_default();
    // minsamples falls back to min_cluster_size, mirroring common HDBSCAN APIs.
    let minsamples = opts.minsamples.unwrap_or(opts.min_cluster_size);
    if minsamples < 2 {
        return Err(ClusteringError::InvalidInput(
            "minsamples must be at least 2".into(),
        ));
    }
    if opts.min_cluster_size < 2 {
        return Err(ClusteringError::InvalidInput(
            "min_cluster_size must be at least 2".into(),
        ));
    }
    // Density estimate per point, then a distance matrix that inflates
    // pairwise distances in sparse regions.
    let core_distances = compute_core_distances(data, minsamples, opts.metric)?;
    let mutual_reachability = compute_mutual_reachability(data, &core_distances, opts.metric)?;
    let mst = build_mst(&mutual_reachability)?;
    let single_linkage_tree = mst_to_single_linkage(&mst, nsamples)?;
    let condensed_tree = condense_tree(&single_linkage_tree, opts.min_cluster_size)?;
    let (labels, probabilities) = extract_clusters(
        &condensed_tree,
        opts.cluster_selection_method,
        opts.allow_single_cluster,
        nsamples,
    )?;
    // Cluster centers are optional extras.
    let (centroids, medoids) = if opts.store_centers.is_some() {
        compute_centers(data, &labels, &opts.store_centers)?
    } else {
        (None, None)
    };
    Ok(HDBSCANResult {
        labels,
        probabilities,
        condensed_tree: Some(condensed_tree),
        single_linkage_tree: Some(single_linkage_tree),
        centroids,
        medoids,
    })
}
/// `(centroids, medoids)` pair returned by [`compute_centers`].
type CentersResult<F> = (Option<Array2<F>>, Option<Array1<usize>>);

/// Computes per-cluster centroids and/or medoids according to
/// `store_centers`. Noise points (label `< 0`) are excluded.
///
/// NOTE(review): medoid distances are hard-coded to Euclidean and ignore the
/// metric configured in `HDBSCANOptions` — confirm whether the metric should
/// be threaded through here.
#[allow(dead_code)]
fn compute_centers<F>(
    data: ArrayView2<F>,
    labels: &Array1<i32>,
    store_centers: &Option<StoreCenter>,
) -> Result<CentersResult<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    if store_centers.is_none() {
        return Ok((None, None));
    }
    // Safe: is_none() was just checked.
    let store_centers = store_centers.expect("Operation failed");
    let compute_centroids =
        store_centers == StoreCenter::Centroid || store_centers == StoreCenter::Both;
    let compute_medoids =
        store_centers == StoreCenter::Medoid || store_centers == StoreCenter::Both;
    // Cluster labels are dense 0..k, so the count is max(label) + 1.
    let n_clusters = labels
        .iter()
        .filter(|&&l| l >= 0)
        .fold(0, |max, &l| max.max(l + 1));
    if n_clusters == 0 {
        return Ok((None, None));
    }
    let nsamples = data.shape()[0];
    let n_features = data.shape()[1];
    let centroids = if compute_centroids {
        // Sum members per cluster, then divide by the member count.
        let mut centroids = Array2::<F>::zeros((n_clusters as usize, n_features));
        let mut counts = vec![0; n_clusters as usize];
        for i in 0..nsamples {
            let label = labels[i];
            if label >= 0 {
                let cluster_idx = label as usize;
                counts[cluster_idx] += 1;
                for j in 0..n_features {
                    centroids[[cluster_idx, j]] = centroids[[cluster_idx, j]] + data[[i, j]];
                }
            }
        }
        for i in 0..n_clusters as usize {
            if counts[i] > 0 {
                for j in 0..n_features {
                    centroids[[i, j]] =
                        centroids[[i, j]] / F::from_usize(counts[i]).expect("Operation failed");
                }
            }
        }
        Some(centroids)
    } else {
        None
    };
    let medoids = if compute_medoids {
        let mut medoids = Vec::with_capacity(n_clusters as usize);
        for cluster_idx in 0..n_clusters {
            let cluster_points: Vec<usize> = labels
                .iter()
                .enumerate()
                .filter(|(_, &l)| l == cluster_idx)
                .map(|(i, _)| i)
                .collect();
            // Empty cluster should not occur for dense labels; index 0 is a
            // placeholder to keep medoid positions aligned with cluster ids.
            if cluster_points.is_empty() {
                medoids.push(0);
                continue;
            }
            // O(k^2) scan: the medoid minimises the total distance to all
            // other members of its cluster.
            let mut min_dist_sum = F::infinity();
            let mut medoid_idx = cluster_points[0];
            for &point_idx in &cluster_points {
                let mut dist_sum = F::zero();
                for &other_idx in &cluster_points {
                    if point_idx != other_idx {
                        let point1 = data.row(point_idx).to_vec();
                        let point2 = data.row(other_idx).to_vec();
                        let dist = distance::euclidean(&point1, &point2);
                        dist_sum = dist_sum + dist;
                    }
                }
                if dist_sum < min_dist_sum {
                    min_dist_sum = dist_sum;
                    medoid_idx = point_idx;
                }
            }
            medoids.push(medoid_idx);
        }
        Some(Array1::from(medoids))
    } else {
        None
    };
    Ok((centroids, medoids))
}
/// Extracts a flat DBSCAN-style clustering from a fitted HDBSCAN result by
/// cutting the single-linkage tree at `cut_distance`.
///
/// Merges occurring at lambda (= 1 / distance) at or above the cut are kept;
/// connected components with more than one member become clusters with dense
/// `0..k` labels, and singletons stay noise (`-1`).
///
/// # Errors
/// * `InvalidInput` if the result has no stored single-linkage tree.
/// * `InvalidInput` if `cut_distance` is not positive.
#[allow(dead_code)]
pub fn dbscan_clustering<F>(
    hdbscan_result: &HDBSCANResult<F>,
    cut_distance: F,
) -> Result<Array1<i32>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let single_linkage_tree = match &hdbscan_result.single_linkage_tree {
        Some(tree) => tree,
        None => {
            // BUGFIX: error message previously read "HDBSCAN _result" — a
            // leftover of a mechanical rename.
            return Err(ClusteringError::InvalidInput(
                "HDBSCAN result does not contain a single-linkage tree".into(),
            ))
        }
    };
    // Work in lambda space (lambda = 1 / distance): larger lambda == denser.
    let cut_lambda = if cut_distance > F::zero() {
        F::one() / cut_distance
    } else {
        return Err(ClusteringError::InvalidInput(
            "cut_distance must be positive".into(),
        ));
    };
    let nsamples = hdbscan_result.labels.len();
    let mut union_find = UnionFind::new(nsamples);
    let lambdas: Vec<F> = single_linkage_tree
        .distances
        .iter()
        .map(|&d| {
            if d > F::zero() {
                F::one() / d
            } else {
                F::infinity()
            }
        })
        .collect();
    for (i, &lambda) in lambdas.iter().enumerate() {
        // Skip merges that happen farther apart than the cut distance.
        if lambda < cut_lambda {
            continue;
        }
        let left = single_linkage_tree.left_child[i];
        let right = single_linkage_tree.right_child[i];
        if left < nsamples as i32 && left >= 0 && right < nsamples as i32 && right >= 0 {
            // Both children are original points: union them directly.
            union_find.union(left as usize, right as usize);
        } else if left < nsamples as i32 && left >= 0 {
            // Right child is an internal node: union `left` with all of its leaves.
            let right_points = get_leaves(right, single_linkage_tree, nsamples as i32);
            for &point in &right_points {
                if point >= 0 && point < nsamples as i32 {
                    union_find.union(left as usize, point as usize);
                }
            }
        } else if right < nsamples as i32 && right >= 0 {
            // Mirror case: left child is internal.
            let left_points = get_leaves(left, single_linkage_tree, nsamples as i32);
            for &point in &left_points {
                if point >= 0 && point < nsamples as i32 {
                    union_find.union(right as usize, point as usize);
                }
            }
        } else {
            // Both children are internal: bridge them via one representative
            // leaf of the left subtree (its other leaves were already joined
            // by earlier, denser merges).
            let left_points = get_leaves(left, single_linkage_tree, nsamples as i32);
            let right_points = get_leaves(right, single_linkage_tree, nsamples as i32);
            if !left_points.is_empty() && !right_points.is_empty() {
                let left_rep = left_points[0];
                for &point in &right_points {
                    if point >= 0
                        && point < nsamples as i32
                        && left_rep >= 0
                        && left_rep < nsamples as i32
                    {
                        union_find.union(left_rep as usize, point as usize);
                    }
                }
            }
        }
    }
    // Relabel: components with more than one member become clusters numbered
    // in order of first appearance; singletons remain noise.
    let mut labels = vec![-1; nsamples];
    let mut cluster_map = std::collections::HashMap::new();
    let mut next_label = 0;
    for (i, label) in labels.iter_mut().enumerate().take(nsamples) {
        let root = union_find.find(i);
        if union_find.size(root) > 1 {
            let cluster_label = *cluster_map.entry(root).or_insert_with(|| {
                let label = next_label;
                next_label += 1;
                label
            });
            *label = cluster_label;
        }
    }
    Ok(Array1::from(labels))
}
/// Collects the leaf (original sample) ids under `node` in the
/// single-linkage tree. Ids below `nsamples` are leaves; ids at or above it
/// refer to internal merge nodes indexed by `id - nsamples`.
///
/// Negative or out-of-range internal ids yield an empty result.
#[allow(dead_code)]
fn get_leaves(node: i32, tree: &SingleLinkageTree<impl Float>, nsamples: i32) -> Vec<i32> {
    // NOTE: parameter renamed from `_node` — the leading underscore wrongly
    // signalled an unused binding.
    let mut leaves = Vec::new();
    // Base case: `node` is an original sample.
    if node < nsamples && node >= 0 {
        leaves.push(node);
        return leaves;
    }
    // Negative ids wrap to a huge usize here and are rejected by the bound
    // check below, matching the original behaviour.
    let node_idx = (node - nsamples) as usize;
    if node_idx >= tree.left_child.len() {
        return leaves;
    }
    let left = tree.left_child[node_idx];
    let right = tree.right_child[node_idx];
    leaves.extend(get_leaves(left, tree, nsamples));
    leaves.extend(get_leaves(right, tree, nsamples));
    leaves
}
/// Mutual-reachability distance between two points: the largest of their
/// pairwise distance and their two core distances. This inflates distances
/// in sparse regions so density-based linkage favours dense areas.
#[allow(dead_code)]
fn mutual_reachability_distance<F: Float>(distance: F, core_dist1: F, coredist2: F) -> F {
    let core_max = core_dist1.max(coredist2);
    core_max.max(distance)
}
/// Computes each sample's core distance: the distance to its
/// `(minsamples - 1)`-th nearest *other* point (i.e. the `minsamples`-th
/// neighbour counting the point itself).
///
/// # Errors
/// `InvalidInput` when `minsamples` exceeds the number of samples.
#[allow(dead_code)]
fn compute_core_distances<F>(
    data: ArrayView2<F>,
    minsamples: usize,
    metric: DistanceMetric,
) -> Result<Array1<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = data.shape()[0];
    if minsamples > nsamples {
        return Err(ClusteringError::InvalidInput(format!(
            "minsamples ({}) cannot be larger than the number of samples ({})",
            minsamples, nsamples
        )));
    }
    let mut core_distances = Array1::<F>::zeros(nsamples);
    // Full symmetric pairwise-distance matrix (zero diagonal).
    let mut distances = Array2::<F>::zeros((nsamples, nsamples));
    for i in 0..nsamples {
        for j in (i + 1)..nsamples {
            let point1 = data.row(i).to_vec();
            let point2 = data.row(j).to_vec();
            let dist = match metric {
                DistanceMetric::Euclidean => distance::euclidean(&point1, &point2),
                DistanceMetric::Manhattan => distance::manhattan(&point1, &point2),
                DistanceMetric::Chebyshev => distance::chebyshev(&point1, &point2),
                // NOTE(review): Minkowski order is hard-coded to p = 3 —
                // confirm this is intended rather than configurable.
                DistanceMetric::Minkowski => distance::minkowski(
                    &point1,
                    &point2,
                    F::from(3.0).expect("Failed to convert constant to float"),
                ),
            };
            distances[[i, j]] = dist;
            distances[[j, i]] = dist;
        }
    }
    for i in 0..nsamples {
        // Distances from point i to all other points, sorted ascending.
        let mut row_distances: Vec<F> = Vec::with_capacity(nsamples.saturating_sub(1));
        for j in 0..nsamples {
            if i != j {
                row_distances.push(distances[[i, j]]);
            }
        }
        row_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
        // BUGFIX (off-by-one): the previous guard `minsamples - 1 < len`
        // rejected the valid case `minsamples == nsamples` and silently fell
        // back to the nearest neighbour. Index `minsamples - 2` is in bounds
        // whenever `minsamples - 2 < len`.
        if minsamples > 1 && minsamples - 2 < row_distances.len() {
            core_distances[i] = row_distances[minsamples - 2];
        } else {
            core_distances[i] = if row_distances.is_empty() {
                F::zero()
            } else {
                row_distances[0]
            };
        }
    }
    Ok(core_distances)
}
/// Builds the symmetric mutual-reachability matrix: entry `(i, j)` is
/// `max(d(i, j), core(i), core(j))`; the diagonal carries each point's own
/// core distance.
#[allow(dead_code)]
fn compute_mutual_reachability<F>(
    data: ArrayView2<F>,
    core_distances: &Array1<F>,
    metric: DistanceMetric,
) -> Result<Array2<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let n = data.shape()[0];
    let mut reachability = Array2::<F>::zeros((n, n));
    for row in 0..n {
        for col in (row + 1)..n {
            let a = data.row(row).to_vec();
            let b = data.row(col).to_vec();
            // Raw pairwise distance under the configured metric.
            let base_dist = match metric {
                DistanceMetric::Euclidean => distance::euclidean(&a, &b),
                DistanceMetric::Manhattan => distance::manhattan(&a, &b),
                DistanceMetric::Chebyshev => distance::chebyshev(&a, &b),
                DistanceMetric::Minkowski => distance::minkowski(
                    &a,
                    &b,
                    F::from(3.0).expect("Failed to convert constant to float"),
                ),
            };
            let mrd =
                mutual_reachability_distance(base_dist, core_distances[row], core_distances[col]);
            reachability[[row, col]] = mrd;
            reachability[[col, row]] = mrd;
        }
        reachability[[row, row]] = core_distances[row];
    }
    Ok(reachability)
}
/// Builds a minimum spanning tree over the dense mutual-reachability matrix
/// using Prim's algorithm (O(n^2), appropriate for a dense matrix).
///
/// Returns `nsamples - 1` edges as `(source, dest, weight)` in insertion
/// order; an empty matrix yields an empty edge list.
///
/// # Errors
/// `ComputationError` when the graph is disconnected (no finite edge can
/// extend the tree).
#[allow(dead_code)]
fn build_mst<F>(distances: &Array2<F>) -> Result<Vec<(usize, usize, F)>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = distances.shape()[0];
    // BUGFIX: guard the empty matrix — `nsamples - 1` below would underflow
    // and panic.
    if nsamples == 0 {
        return Ok(Vec::new());
    }
    let mut mst_edges = Vec::with_capacity(nsamples - 1);
    let mut in_mst = vec![false; nsamples];
    // Cheapest known connection from the growing tree to each outside node,
    // and the tree node providing it.
    let mut min_distances = vec![F::infinity(); nsamples];
    let mut source_nodes = vec![0; nsamples];
    let mut current_node = 0;
    min_distances[current_node] = F::zero();
    for _ in 0..(nsamples - 1) {
        in_mst[current_node] = true;
        // Relax edges from the node just added to the tree.
        for j in 0..nsamples {
            if !in_mst[j] {
                let distance = distances[[current_node, j]];
                if distance < min_distances[j] {
                    min_distances[j] = distance;
                    source_nodes[j] = current_node;
                }
            }
        }
        // Pick the cheapest frontier node.
        let mut min_dist = F::infinity();
        let mut next_node = 0;
        for j in 0..nsamples {
            if !in_mst[j] && min_distances[j] < min_dist {
                min_dist = min_distances[j];
                next_node = j;
            }
        }
        if min_dist.is_infinite() {
            return Err(ClusteringError::ComputationError(
                "Graph is disconnected; HDBSCAN requires a connected graph.".into(),
            ));
        }
        mst_edges.push((source_nodes[next_node], next_node, min_dist));
        current_node = next_node;
    }
    Ok(mst_edges)
}
/// Disjoint-set (union-find) structure with path compression and
/// union-by-size, used to track merging clusters.
struct UnionFind {
    parent: Vec<usize>,
    size: Vec<usize>,
}

impl UnionFind {
    /// Creates `n` singleton sets, each element its own root with size 1.
    fn new(n: usize) -> Self {
        UnionFind {
            parent: (0..n).collect(),
            size: vec![1; n],
        }
    }

    /// Returns the root of `x`'s set, re-pointing every node on the path
    /// directly at the root (full path compression, done iteratively).
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: locate the root.
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }
        // Pass 2: compress the path.
        let mut node = x;
        while self.parent[node] != root {
            let next = self.parent[node];
            self.parent[node] = root;
            node = next;
        }
        root
    }

    /// Merges the sets containing `x` and `y`, attaching the smaller tree
    /// below the larger (ties keep `x`'s root), and returns the merged root.
    fn union(&mut self, x: usize, y: usize) -> usize {
        let root_x = self.find(x);
        let root_y = self.find(y);
        if root_x == root_y {
            return root_x;
        }
        let (winner, loser) = if self.size[root_x] < self.size[root_y] {
            (root_y, root_x)
        } else {
            (root_x, root_y)
        };
        self.parent[loser] = winner;
        self.size[winner] += self.size[loser];
        winner
    }

    /// Number of elements in the set containing `x`.
    fn size(&mut self, x: usize) -> usize {
        let root = self.find(x);
        self.size[root]
    }
}
/// Converts an MST into a single-linkage dendrogram by replaying its edges
/// in ascending weight order.
///
/// Internal cluster nodes are numbered `nsamples..(2 * nsamples - 1)`; child
/// ids below `nsamples` are original points.
#[allow(dead_code)]
fn mst_to_single_linkage<F>(
    mst: &[(usize, usize, F)],
    nsamples: usize,
) -> Result<SingleLinkageTree<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    // Guard the degenerate case so `nsamples - 1` cannot underflow below.
    if nsamples == 0 {
        return Ok(SingleLinkageTree {
            left_child: Vec::new(),
            right_child: Vec::new(),
            distances: Vec::new(),
            sizes: Vec::new(),
        });
    }
    let mut sorted_mst = mst.to_vec();
    sorted_mst.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(Ordering::Equal));
    let mut left_child = Vec::with_capacity(nsamples - 1);
    let mut right_child = Vec::with_capacity(nsamples - 1);
    let mut distances = Vec::with_capacity(nsamples - 1);
    let mut sizes = Vec::with_capacity(nsamples - 1);
    // Slots for every sample plus every internal merge node.
    let total_nodes = nsamples + (nsamples - 1);
    let mut union_find = UnionFind::new(total_nodes);
    let mut next_id = nsamples;
    for (source, dest, distance) in sorted_mst {
        let cluster1 = union_find.find(source);
        let cluster2 = union_find.find(dest);
        if cluster1 == cluster2 {
            continue;
        }
        let size1 = union_find.size(cluster1);
        let size2 = union_find.size(cluster2);
        left_child.push(cluster1 as i32);
        right_child.push(cluster2 as i32);
        distances.push(distance);
        sizes.push(size1 + size2);
        // Re-parent both merged roots onto the freshly created cluster node.
        // (The previous `union(cluster1, cluster2)` call was redundant: its
        // only effect was overwritten by these assignments.)
        union_find.parent[cluster1] = next_id;
        union_find.parent[cluster2] = next_id;
        // BUGFIX: record the merged size on the new cluster node. It was
        // previously left at its initial value of 1, so every later merge
        // involving this cluster reported a size of 1 and the dendrogram's
        // `sizes` were wrong for nested merges.
        union_find.size[next_id] = size1 + size2;
        next_id += 1;
    }
    Ok(SingleLinkageTree {
        left_child,
        right_child,
        distances,
        sizes,
    })
}
/// Condenses the single-linkage tree: each merge is examined in order, and
/// only split sides that reach `min_cluster_size` persist as clusters; smaller
/// sides fall out as individual points.
///
/// Sample points are encoded as negative child ids (`-(point + 1)`); cluster
/// nodes keep positive ids. Edges with negligible lambda (below 1/1000 of the
/// maximum) are filtered out at the end.
#[allow(dead_code)]
fn condense_tree<F>(
    single_linkage_tree: &SingleLinkageTree<F>,
    min_cluster_size: usize,
) -> Result<CondensedTree<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let n_merges = single_linkage_tree.distances.len();
    let nsamples = n_merges + 1;
    let mut parent = Vec::new();
    let mut child = Vec::new();
    let mut lambda_val = Vec::new();
    let mut sizes = Vec::new();
    // lambda = 1 / distance; zero-distance merges get lambda 0 here.
    let mut lambdas: Vec<F> = single_linkage_tree
        .distances
        .iter()
        .map(|&d| {
            if d > F::zero() {
                F::one() / d
            } else {
                F::zero()
            }
        })
        .collect();
    lambdas.push(F::zero());
    // Noise floor for the final filtering pass.
    let min_lambda = if lambdas.is_empty() {
        F::zero()
    } else {
        let max_val = lambdas.iter().fold(F::zero(), |max, &val| max.max(val));
        max_val / F::from(1000.0).expect("Failed to convert constant to float")
    };
    // node_sizes[id]: 1 for sample points, then the recorded merge sizes.
    let mut node_sizes = vec![1; nsamples];
    for &merge_size in &single_linkage_tree.sizes {
        node_sizes.push(merge_size);
    }
    let mut cluster_map = std::collections::HashMap::new();
    let mut next_cluster_id = nsamples;
    // BUGFIX (encoding): `&current_lambda` had been corrupted to
    // `¤t_lambda`, which does not compile.
    for (i, &current_lambda) in lambdas.iter().enumerate().take(n_merges) {
        let left = single_linkage_tree.left_child[i];
        let right = single_linkage_tree.right_child[i];
        let current_parent = nsamples + i;
        let left_size = node_sizes[left as usize];
        let right_size = node_sizes[right as usize];
        if left_size >= min_cluster_size {
            // Left side survives as a cluster; assign it a stable mapped id.
            let mapped_left = *cluster_map.entry(left).or_insert_with(|| {
                let id = next_cluster_id;
                next_cluster_id += 1;
                id
            });
            parent.push(current_parent as i32);
            child.push(mapped_left as i32);
            lambda_val.push(current_lambda);
            sizes.push(left_size);
        } else if left >= 0 && left < nsamples as i32 {
            // Left side is a single point falling out of the cluster.
            parent.push(current_parent as i32);
            child.push(-(left + 1));
            lambda_val.push(current_lambda);
            sizes.push(1);
        }
        if right_size >= min_cluster_size {
            let mapped_right = *cluster_map.entry(right).or_insert_with(|| {
                let id = next_cluster_id;
                next_cluster_id += 1;
                id
            });
            parent.push(current_parent as i32);
            child.push(mapped_right as i32);
            lambda_val.push(current_lambda);
            sizes.push(right_size);
        } else if right >= 0 && right < nsamples as i32 {
            parent.push(current_parent as i32);
            child.push(-(right + 1));
            lambda_val.push(current_lambda);
            sizes.push(1);
        }
        cluster_map.insert(current_parent as i32, current_parent);
    }
    // Drop edges whose lambda is below the noise floor computed above.
    let mut filtered_parent = Vec::new();
    let mut filtered_child = Vec::new();
    let mut filtered_lambda = Vec::new();
    let mut filtered_sizes = Vec::new();
    for i in 0..parent.len() {
        if lambda_val[i] >= min_lambda {
            filtered_parent.push(parent[i]);
            filtered_child.push(child[i]);
            filtered_lambda.push(lambda_val[i]);
            filtered_sizes.push(sizes[i]);
        }
    }
    Ok(CondensedTree {
        parent: filtered_parent,
        child: filtered_child,
        lambda_val: filtered_lambda,
        sizes: filtered_sizes,
    })
}
/// Selects flat clusters from the condensed tree and assigns per-point
/// labels and probabilities.
///
/// `Leaf` takes every leaf cluster; `EOM` (Excess of Mass) accumulates
/// subtree stabilities bottom-up, then walks top-down keeping each node that
/// is at least as stable as its best child. Points outside any selected
/// cluster become noise (`-1`).
#[allow(dead_code)]
fn extract_clusters<F>(
    condensed_tree: &CondensedTree<F>,
    method: ClusterSelectionMethod,
    allow_single_cluster: bool,
    nsamples: usize,
) -> Result<(Array1<i32>, Array1<F>)>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    // The root is the largest parent id in the condensed tree.
    let root: i32 = condensed_tree.parent.iter().fold(0, |max, &p| max.max(p));
    // A node is a leaf if it never appears as a parent; negative ids are
    // encoded sample points and always count as leaves.
    let mut is_leaf = std::collections::HashSet::new();
    let mut parent_set = std::collections::HashSet::new();
    for &parent in &condensed_tree.parent {
        parent_set.insert(parent);
    }
    for &child in &condensed_tree.child {
        if !parent_set.contains(&child) || child < 0 {
            is_leaf.insert(child);
        }
    }
    if method == ClusterSelectionMethod::Leaf {
        // Leaf selection: every non-point leaf becomes a cluster.
        let leaf_clusters: Vec<i32> = is_leaf.iter().filter(|&&node| node >= 0).cloned().collect();
        let (labels, probabilities) =
            assign_points_to_clusters(condensed_tree, &leaf_clusters, root, nsamples)?;
        return Ok((labels, probabilities));
    }
    // EOM: process edges from the largest lambda (densest) downwards so
    // children are scored before their parents.
    let mut subtree_stability = std::collections::HashMap::new();
    let mut nodes_by_lambda: Vec<(usize, F)> = condensed_tree
        .lambda_val
        .iter()
        .enumerate()
        .map(|(i, &lambda)| (i, lambda))
        .collect();
    nodes_by_lambda.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
    for (idx, _) in nodes_by_lambda {
        let child = condensed_tree.child[idx];
        let parent = condensed_tree.parent[idx];
        let lambda = condensed_tree.lambda_val[idx];
        let size = condensed_tree.sizes[idx];
        if is_leaf.contains(&child) && child >= 0 {
            // Leaf stability: the lambda at which the cluster appears,
            // weighted by its size.
            let stability = lambda * F::from_usize(size).expect("Operation failed");
            subtree_stability.insert(child, stability);
        } else if child >= 0 {
            let child_stability = *subtree_stability.get(&child).unwrap_or(&F::zero());
            subtree_stability.insert(child, child_stability);
        }
        if child >= 0 {
            let child_lambda = lambda;
            let child_stability = *subtree_stability.get(&child).unwrap_or(&F::zero());
            // Lambda of the edge entering the parent (zero when the parent
            // is the root or has no incoming edge).
            let parent_lambda = condensed_tree
                .lambda_val
                .iter()
                .zip(condensed_tree.parent.iter())
                .find(|(_, &p)| p == parent && p != root)
                .map(|(l_, _)| *l_)
                .unwrap_or(F::zero());
            let lambda_diff = child_lambda - parent_lambda;
            if lambda_diff > F::zero() {
                // Propagate the child's stability plus its own excess of
                // mass up to the parent.
                let stability_delta = lambda_diff * F::from_usize(size).expect("Operation failed");
                let parent_stability = subtree_stability.entry(parent).or_insert(F::zero());
                *parent_stability = *parent_stability + child_stability + stability_delta;
            }
        }
    }
    // Top-down selection: keep a node when it is at least as stable as its
    // best child; otherwise descend into the children.
    let mut selected_clusters = std::collections::HashSet::new();
    let mut to_process = vec![root];
    while let Some(node) = to_process.pop() {
        let children: Vec<i32> = condensed_tree
            .parent
            .iter()
            .zip(condensed_tree.child.iter())
            .filter(|&(p_, _)| *p_ == node)
            .map(|(_, c)| *c)
            .collect();
        if children.is_empty() {
            if node >= 0 {
                selected_clusters.insert(node);
            }
            continue;
        }
        let node_stability = *subtree_stability.get(&node).unwrap_or(&F::zero());
        let max_child_stability = children
            .iter()
            .filter(|&&c| c >= 0)
            .map(|&c| *subtree_stability.get(&c).unwrap_or(&F::zero()))
            .fold(F::zero(), |max, s| max.max(s));
        // The root is never selected directly (only via the
        // allow_single_cluster fallback below).
        if max_child_stability > node_stability || node == root {
            to_process.extend(children.iter().filter(|&&c| c >= 0).cloned());
        } else {
            if node >= 0 {
                selected_clusters.insert(node);
            }
        }
    }
    let selected_clusters_vec: Vec<i32> = selected_clusters.into_iter().collect();
    if selected_clusters_vec.is_empty() && allow_single_cluster {
        // Fallback: pick the highest-numbered non-root cluster so at least
        // one cluster is produced.
        let highest_child = condensed_tree
            .child
            .iter()
            .filter(|&&c| c >= 0 && c != root)
            .cloned()
            .max()
            .unwrap_or(-1);
        if highest_child >= 0 {
            let (labels, probabilities) =
                assign_points_to_clusters(condensed_tree, &[highest_child], root, nsamples)?;
            return Ok((labels, probabilities));
        }
    }
    let (labels, probabilities) =
        assign_points_to_clusters(condensed_tree, &selected_clusters_vec, root, nsamples)?;
    Ok((labels, probabilities))
}
/// Assigns each sample to one of `selected_clusters` (or noise) and computes
/// a membership strength normalised to `[0, 1]`.
///
/// Samples appear in the condensed tree as negative child ids
/// (`-(point + 1)`); each point's edge is followed upwards until a selected
/// cluster — or the root — is reached. Final labels are remapped to a dense
/// `0..k` range.
#[allow(dead_code)]
fn assign_points_to_clusters<F>(
    condensed_tree: &CondensedTree<F>,
    selected_clusters: &[i32],
    root: i32,
    nsamples: usize,
) -> Result<(Array1<i32>, Array1<F>)>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let mut labels = vec![-1; nsamples];
    let mut probabilities = vec![F::zero(); nsamples];
    let mut leaf_cluster_map = std::collections::HashMap::new();
    for &cluster_id in selected_clusters {
        leaf_cluster_map.insert(cluster_id, cluster_id);
    }
    for point_idx in 0..nsamples {
        // Encoded id of this sample in the condensed tree.
        let point_label = -(point_idx as i32 + 1);
        let point_edges: Vec<(i32, F)> = condensed_tree
            .child
            .iter()
            .zip(condensed_tree.parent.iter())
            .zip(condensed_tree.lambda_val.iter())
            .filter(|&((c, _), _)| *c == point_label)
            .map(|((&_, &p), &lambda)| (p, lambda))
            .collect();
        if point_edges.is_empty() {
            continue;
        }
        let mut max_lambda = F::zero();
        let mut cluster_label = -1;
        for (node, lambda) in point_edges {
            let mut current_node = node;
            let mut path_max_lambda = lambda;
            loop {
                // BUGFIX (encoding): `&current_node` had been corrupted to
                // `¤t_node`, which does not compile.
                if leaf_cluster_map.contains_key(&current_node) {
                    if path_max_lambda > max_lambda {
                        max_lambda = path_max_lambda;
                        cluster_label = *leaf_cluster_map
                            .get(&current_node)
                            .expect("Operation failed");
                    }
                    break;
                }
                // Climb to this node's incoming edge (if any).
                let parent_edges: Vec<(i32, F)> = condensed_tree
                    .child
                    .iter()
                    .zip(condensed_tree.parent.iter())
                    .zip(condensed_tree.lambda_val.iter())
                    .filter(|&((c, _), _)| *c == current_node)
                    .map(|((&_, &p), &lambda)| (p, lambda))
                    .collect();
                if parent_edges.is_empty() || current_node == root {
                    break;
                }
                let (parent, parent_lambda) = parent_edges[0];
                // Track the weakest link along the path.
                path_max_lambda = path_max_lambda.min(parent_lambda);
                current_node = parent;
            }
        }
        if cluster_label >= 0 {
            labels[point_idx] = cluster_label;
            probabilities[point_idx] = max_lambda;
        }
    }
    // Normalise membership strengths to [0, 1].
    let max_prob = probabilities.iter().fold(F::zero(), |max, &p| max.max(p));
    if max_prob > F::zero() {
        for prob in &mut probabilities {
            *prob = *prob / max_prob;
        }
    }
    // Remap surviving labels to a dense 0..k range. FIX: sort the unique
    // labels first so the mapping is deterministic (HashSet iteration order
    // is not) and output labels are stable across runs.
    let mut unique_labels: Vec<i32> = {
        let mut seen = std::collections::HashSet::new();
        for &label in &labels {
            if label >= 0 {
                seen.insert(label);
            }
        }
        seen.into_iter().collect()
    };
    unique_labels.sort_unstable();
    let remap: std::collections::HashMap<i32, i32> = unique_labels
        .iter()
        .enumerate()
        .map(|(i, &label)| (label, i as i32))
        .collect();
    let remapped_labels: Vec<i32> = labels
        .iter()
        .map(|&label| {
            if label >= 0 {
                *remap.get(&label).unwrap_or(&label)
            } else {
                label
            }
        })
        .collect();
    Ok((Array1::from(remapped_labels), Array1::from(probabilities)))
}
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// Smoke test: two well-separated 2-D blobs of three points each should
    /// cluster without error and yield one label/probability per sample.
    #[test]
    fn test_hdbscan_placeholder() {
        let data = Array2::from_shape_vec(
            (6, 2),
            vec![
                1.0, 2.0, 1.5, 1.8, 1.3, 2.2, 8.0, 9.0, 8.2, 8.8, 7.8, 9.1,
            ],
        )
        .expect("Operation failed");
        // Small min sizes so three-point blobs qualify as clusters.
        let options = HDBSCANOptions {
            minsamples: Some(2), min_cluster_size: 2, cluster_selection_epsilon: 0.0,
            allow_single_cluster: true, ..Default::default()
        };
        let result = hdbscan(data.view(), Some(options));
        assert!(result.is_ok(), "HDBSCAN failed: {:?}", result.err());
        let result = result.expect("Operation failed");
        assert_eq!(
            result.labels.len(),
            6,
            "Labels length should be 6, got: {:?}",
            result.labels
        );
        assert_eq!(
            result.probabilities.len(),
            6,
            "Probabilities length should be 6, got: {:?}",
            result.probabilities
        );
    }

    /// Mutual reachability is the max of the raw distance and both core
    /// distances.
    #[test]
    fn test_mutual_reachability() {
        let d = 2.0;
        let core1 = 1.0;
        let core2 = 3.0;
        let mrd = mutual_reachability_distance(d, core1, core2);
        assert_eq!(mrd, 3.0);
    }
}