use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
fn dist_sq<F: Float>(a: ArrayView1<F>, b: ArrayView1<F>) -> F {
let mut s = F::zero();
for i in 0..a.len().min(b.len()) {
let d = a[i] - b[i];
s = s + d * d;
}
s
}
fn pairwise_distances<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> Array2<F> {
let n = data.shape()[0];
let mut dists = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in (i + 1)..n {
let d = dist_sq(data.row(i), data.row(j)).sqrt();
dists[[i, j]] = d;
dists[[j, i]] = d;
}
}
dists
}
fn knn_distances<F: Float + FromPrimitive + Debug>(
dist_mat: &Array2<F>,
k: usize,
) -> Vec<Vec<(usize, F)>> {
let n = dist_mat.shape()[0];
let mut result = Vec::with_capacity(n);
for i in 0..n {
let mut dists: Vec<(usize, F)> = (0..n)
.filter(|&j| j != i)
.map(|j| (j, dist_mat[[i, j]]))
.collect();
dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
dists.truncate(k);
result.push(dists);
}
result
}
#[derive(Debug, Clone)]
pub struct HdbscanStarConfig {
pub min_cluster_size: usize,
pub min_samples: usize,
pub compute_probabilities: bool,
}
impl Default for HdbscanStarConfig {
fn default() -> Self {
Self {
min_cluster_size: 5,
min_samples: 5,
compute_probabilities: true,
}
}
}
#[derive(Debug, Clone)]
pub struct HdbscanStarResult<F: Float> {
pub labels: Array1<i32>,
pub probabilities: Option<Array1<F>>,
pub cluster_stabilities: Vec<F>,
pub n_clusters: usize,
pub outlier_scores: Array1<F>,
}
#[derive(Debug, Clone)]
struct MstEdge<F: Float> {
i: usize,
j: usize,
weight: F,
}
pub fn hdbscan_star<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
config: &HdbscanStarConfig,
) -> Result<HdbscanStarResult<F>> {
let n = data.shape()[0];
if n == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
if config.min_cluster_size < 2 {
return Err(ClusteringError::InvalidInput(
"min_cluster_size must be >= 2".into(),
));
}
if config.min_samples < 1 {
return Err(ClusteringError::InvalidInput(
"min_samples must be >= 1".into(),
));
}
let dist_mat = pairwise_distances(data);
let mpts = config.min_samples;
let mut core_dists = Array1::<F>::zeros(n);
for i in 0..n {
let mut dists: Vec<F> = (0..n)
.filter(|&j| j != i)
.map(|j| dist_mat[[i, j]])
.collect();
dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let k = (mpts - 1).min(dists.len().saturating_sub(1));
core_dists[i] = if k < dists.len() {
dists[k]
} else {
F::infinity()
};
}
let mut mrd = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in (i + 1)..n {
let d = dist_mat[[i, j]];
let mr = d.max(core_dists[i]).max(core_dists[j]);
mrd[[i, j]] = mr;
mrd[[j, i]] = mr;
}
}
let mst = prims_mst(&mrd, n);
let mut sorted_edges = mst;
sorted_edges.sort_by(|a, b| {
a.weight
.partial_cmp(&b.weight)
.unwrap_or(std::cmp::Ordering::Equal)
});
let min_cs = config.min_cluster_size;
let (labels, stabilities) = extract_stable_clusters(&sorted_edges, n, min_cs);
let n_clusters = if stabilities.is_empty() {
0
} else {
labels
.iter()
.filter(|&&l| l >= 0)
.map(|&l| l)
.max()
.map(|m| m as usize + 1)
.unwrap_or(0)
};
let mut outlier_scores = Array1::<F>::zeros(n);
for i in 0..n {
if labels[i] < 0 {
outlier_scores[i] = F::one();
} else {
let ci = labels[i] as usize;
let mut min_core_dist = F::infinity();
for j in 0..n {
if j != i && labels[j] == labels[i] {
let d = mrd[[i, j]];
if d < min_core_dist {
min_core_dist = d;
}
}
}
if min_core_dist < F::infinity() {
let max_core = core_dists
.iter()
.copied()
.filter(|&d| d < F::infinity())
.fold(F::zero(), |a, b| a.max(b));
if max_core > F::epsilon() {
outlier_scores[i] = min_core_dist / max_core;
}
}
}
}
let probabilities = if config.compute_probabilities {
let mut probs = Array1::<F>::zeros(n);
for i in 0..n {
probs[i] = F::one() - outlier_scores[i].min(F::one());
}
Some(probs)
} else {
None
};
Ok(HdbscanStarResult {
labels,
probabilities,
cluster_stabilities: stabilities,
n_clusters,
outlier_scores,
})
}
fn prims_mst<F: Float>(dist_mat: &Array2<F>, n: usize) -> Vec<MstEdge<F>> {
if n <= 1 {
return Vec::new();
}
let mut in_tree = vec![false; n];
let mut key = vec![F::infinity(); n]; let mut parent = vec![0usize; n];
let mut edges = Vec::with_capacity(n - 1);
key[0] = F::zero();
for _ in 0..n {
let mut u = None;
let mut min_key = F::infinity();
for v in 0..n {
if !in_tree[v] && key[v] < min_key {
min_key = key[v];
u = Some(v);
}
}
let u = match u {
Some(v) => v,
None => break,
};
in_tree[u] = true;
if key[u] > F::zero() {
edges.push(MstEdge {
i: parent[u],
j: u,
weight: key[u],
});
}
for v in 0..n {
if !in_tree[v] && dist_mat[[u, v]] < key[v] {
key[v] = dist_mat[[u, v]];
parent[v] = u;
}
}
}
edges
}
fn extract_stable_clusters<F: Float + FromPrimitive + Debug>(
sorted_edges: &[MstEdge<F>],
n: usize,
min_cluster_size: usize,
) -> (Array1<i32>, Vec<F>) {
let mut parent_uf: Vec<usize> = (0..n).collect();
let mut size: Vec<usize> = vec![1; n];
fn find(parent: &mut Vec<usize>, x: usize) -> usize {
let mut root = x;
while parent[root] != root {
root = parent[root];
}
let mut cur = x;
while cur != root {
let next = parent[cur];
parent[cur] = root;
cur = next;
}
root
}
fn union(parent: &mut Vec<usize>, size: &mut Vec<usize>, a: usize, b: usize) -> usize {
let ra = find(parent, a);
let rb = find(parent, b);
if ra == rb {
return ra;
}
if size[ra] < size[rb] {
parent[ra] = rb;
size[rb] += size[ra];
rb
} else {
parent[rb] = ra;
size[ra] += size[rb];
ra
}
}
let mut component_birth: Vec<Option<F>> = vec![None; n];
for edge in sorted_edges {
let ri = find(&mut parent_uf, edge.i);
let rj = find(&mut parent_uf, edge.j);
if ri == rj {
continue;
}
let lambda = if edge.weight > F::epsilon() {
F::one() / edge.weight
} else {
F::infinity()
};
let new_root = union(&mut parent_uf, &mut size, ri, rj);
if size[new_root] >= min_cluster_size && component_birth[new_root].is_none() {
component_birth[new_root] = Some(lambda);
}
}
let mut final_parent: Vec<usize> = (0..n).collect();
let mut final_size: Vec<usize> = vec![1; n];
for edge in sorted_edges {
let _root = union(&mut final_parent, &mut final_size, edge.i, edge.j);
}
let mut cluster_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
let mut next_label = 0i32;
let mut labels = Array1::from_elem(n, -1i32);
for i in 0..n {
let root = find(&mut final_parent, i);
if final_size[root] >= min_cluster_size {
let label = cluster_map.entry(root).or_insert_with(|| {
let l = next_label;
next_label += 1;
l
});
labels[i] = *label;
}
}
let mut stabilities = Vec::new();
for (&root, &label) in &cluster_map {
let birth = component_birth[root].unwrap_or_else(|| F::zero());
let sz = F::from(final_size[root]).unwrap_or_else(|| F::one());
stabilities.push(birth * sz);
}
(labels, stabilities)
}
#[derive(Debug, Clone)]
pub struct AutoEpsilonConfig {
pub k: usize,
pub min_samples: usize,
pub sensitivity: f64,
}
impl Default for AutoEpsilonConfig {
fn default() -> Self {
Self {
k: 5,
min_samples: 5,
sensitivity: 1.0,
}
}
}
#[derive(Debug, Clone)]
pub struct AutoEpsilonResult<F: Float> {
pub labels: Array1<i32>,
pub epsilon: F,
pub k_distances: Vec<F>,
pub elbow_index: usize,
pub n_clusters: usize,
}
pub fn auto_epsilon_dbscan<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
config: &AutoEpsilonConfig,
) -> Result<AutoEpsilonResult<F>> {
let n = data.shape()[0];
if n == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
if config.k == 0 {
return Err(ClusteringError::InvalidInput("k must be > 0".into()));
}
let dist_mat = pairwise_distances(data);
let k = config.k.min(n - 1);
let mut k_dists: Vec<F> = Vec::with_capacity(n);
for i in 0..n {
let mut row_dists: Vec<F> = (0..n)
.filter(|&j| j != i)
.map(|j| dist_mat[[i, j]])
.collect();
row_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let kd = if k <= row_dists.len() {
row_dists[k - 1]
} else {
*row_dists.last().unwrap_or(&F::zero())
};
k_dists.push(kd);
}
k_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let elbow_idx = find_elbow(&k_dists, config.sensitivity);
let epsilon = k_dists[elbow_idx];
let labels = run_dbscan(data, &dist_mat, epsilon, config.min_samples);
let n_clusters = labels
.iter()
.filter(|&&l| l >= 0)
.map(|&l| l)
.max()
.map(|m| m as usize + 1)
.unwrap_or(0);
Ok(AutoEpsilonResult {
labels,
epsilon,
k_distances: k_dists,
elbow_index: elbow_idx,
n_clusters,
})
}
fn find_elbow<F: Float + FromPrimitive + Debug>(values: &[F], sensitivity: f64) -> usize {
let n = values.len();
if n < 3 {
return n / 2;
}
let x0 = 0.0f64;
let y0 = values[0].to_f64().unwrap_or(0.0);
let x1 = (n - 1) as f64;
let y1 = values[n - 1].to_f64().unwrap_or(0.0);
let dx = x1 - x0;
let dy = y1 - y0;
let line_len = (dx * dx + dy * dy).sqrt().max(1e-15);
let mut max_dist = 0.0f64;
let mut elbow_idx = n / 2;
for i in 1..(n - 1) {
let xi = i as f64;
let yi = values[i].to_f64().unwrap_or(0.0);
let dist = ((dy * xi - dx * yi + x1 * y0 - y1 * x0).abs()) / line_len;
let adjusted = dist * sensitivity;
if adjusted > max_dist {
max_dist = adjusted;
elbow_idx = i;
}
}
elbow_idx
}
fn run_dbscan<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
dist_mat: &Array2<F>,
eps: F,
min_samples: usize,
) -> Array1<i32> {
let n = data.shape()[0];
let mut labels = vec![-2i32; n]; let mut cluster_id = 0i32;
for i in 0..n {
if labels[i] != -2 {
continue;
}
let neighbors: Vec<usize> = (0..n).filter(|&j| dist_mat[[i, j]] <= eps).collect();
if neighbors.len() < min_samples {
labels[i] = -1; continue;
}
labels[i] = cluster_id;
let mut queue = neighbors;
let mut head = 0;
while head < queue.len() {
let cur = queue[head];
head += 1;
if labels[cur] == -1 {
labels[cur] = cluster_id;
continue;
}
if labels[cur] != -2 {
continue;
}
labels[cur] = cluster_id;
let cur_nb: Vec<usize> = (0..n).filter(|&j| dist_mat[[cur, j]] <= eps).collect();
if cur_nb.len() >= min_samples {
for nb in cur_nb {
if labels[nb] == -2 || labels[nb] == -1 {
queue.push(nb);
}
}
}
}
cluster_id += 1;
}
Array1::from_vec(labels)
}
#[derive(Debug, Clone)]
pub struct SnnConfig {
pub k: usize,
pub snn_threshold: usize,
pub min_snn_neighbors: usize,
}
impl Default for SnnConfig {
fn default() -> Self {
Self {
k: 10,
snn_threshold: 3,
min_snn_neighbors: 3,
}
}
}
#[derive(Debug, Clone)]
pub struct SnnResult<F: Float> {
pub labels: Array1<i32>,
pub snn_similarity: Array2<F>,
pub n_clusters: usize,
}
pub fn snn_clustering<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
config: &SnnConfig,
) -> Result<SnnResult<F>> {
let n = data.shape()[0];
if n == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
if config.k == 0 {
return Err(ClusteringError::InvalidInput("k must be > 0".into()));
}
let dist_mat = pairwise_distances(data);
let k = config.k.min(n - 1);
let knn = knn_distances(&dist_mat, k);
let mut snn_sim = Array2::<F>::zeros((n, n));
for i in 0..n {
let knn_i: std::collections::HashSet<usize> = knn[i].iter().map(|&(j, _)| j).collect();
for j in (i + 1)..n {
let knn_j: std::collections::HashSet<usize> =
knn[j].iter().map(|&(jj, _)| jj).collect();
let shared = knn_i.intersection(&knn_j).count();
let sim = F::from(shared).unwrap_or_else(|| F::zero());
snn_sim[[i, j]] = sim;
snn_sim[[j, i]] = sim;
}
}
let threshold = F::from(config.snn_threshold).unwrap_or_else(|| F::one());
let min_nb = config.min_snn_neighbors;
let mut labels = vec![-2i32; n];
let mut cluster_id = 0i32;
for i in 0..n {
if labels[i] != -2 {
continue;
}
let neighbors: Vec<usize> = (0..n)
.filter(|&j| j != i && snn_sim[[i, j]] >= threshold)
.collect();
if neighbors.len() < min_nb {
labels[i] = -1;
continue;
}
labels[i] = cluster_id;
let mut queue = neighbors;
let mut head = 0;
while head < queue.len() {
let cur = queue[head];
head += 1;
if labels[cur] == -1 {
labels[cur] = cluster_id;
continue;
}
if labels[cur] != -2 {
continue;
}
labels[cur] = cluster_id;
let cur_nb: Vec<usize> = (0..n)
.filter(|&j| j != cur && snn_sim[[cur, j]] >= threshold)
.collect();
if cur_nb.len() >= min_nb {
for nb in cur_nb {
if labels[nb] == -2 || labels[nb] == -1 {
queue.push(nb);
}
}
}
}
cluster_id += 1;
}
let n_clusters = labels
.iter()
.filter(|&&l| l >= 0)
.max()
.map(|&m| m as usize + 1)
.unwrap_or(0);
Ok(SnnResult {
labels: Array1::from_vec(labels),
snn_similarity: snn_sim,
n_clusters,
})
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KdeKernel {
Gaussian,
Epanechnikov,
Uniform,
}
#[derive(Debug, Clone)]
pub struct KdcConfig {
pub kernel: KdeKernel,
pub bandwidth: f64,
pub density_threshold: f64,
pub merge_distance: f64,
}
impl Default for KdcConfig {
fn default() -> Self {
Self {
kernel: KdeKernel::Gaussian,
bandwidth: 0.0,
density_threshold: 0.1,
merge_distance: 0.0,
}
}
}
#[derive(Debug, Clone)]
pub struct KdcResult<F: Float> {
pub labels: Array1<i32>,
pub densities: Array1<F>,
pub bandwidth: F,
pub n_clusters: usize,
}
pub fn kernel_density_clustering<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
config: &KdcConfig,
) -> Result<KdcResult<F>> {
let (n, d) = (data.shape()[0], data.shape()[1]);
if n == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
let h = if config.bandwidth <= 0.0 {
silverman_bandwidth(data)
} else {
F::from(config.bandwidth).unwrap_or_else(|| F::one())
};
if h <= F::epsilon() {
return Err(ClusteringError::ComputationError(
"Bandwidth too small".into(),
));
}
let mut densities = Array1::<F>::zeros(n);
let n_f = F::from(n).unwrap_or_else(|| F::one());
let h_d = h.powi(d as i32);
let norm_factor = n_f * h_d;
for i in 0..n {
let mut dens = F::zero();
for j in 0..n {
let u_sq = dist_sq(data.row(i), data.row(j)) / (h * h);
let kval = match config.kernel {
KdeKernel::Gaussian => {
let neg_half = F::from(-0.5).unwrap_or_else(|| F::zero());
(neg_half * u_sq).exp()
}
KdeKernel::Epanechnikov => {
if u_sq < F::one() {
F::one() - u_sq
} else {
F::zero()
}
}
KdeKernel::Uniform => {
if u_sq < F::one() {
F::one()
} else {
F::zero()
}
}
};
dens = dens + kval;
}
densities[i] = dens / norm_factor;
}
let max_density = densities.iter().copied().fold(F::zero(), |a, b| a.max(b));
let threshold = F::from(config.density_threshold).unwrap_or_else(|| F::zero()) * max_density;
let mut peak_assignments = vec![-1i32; n];
let mut peaks: Vec<usize> = Vec::new();
let mut local_max = vec![0usize; n];
for i in 0..n {
let mut current = i;
for _ in 0..100 {
let mut best = current;
let mut best_dens = densities[current];
for j in 0..n {
if dist_sq(data.row(current), data.row(j)).sqrt()
<= h * F::from(2.0).unwrap_or_else(|| F::one())
{
if densities[j] > best_dens {
best_dens = densities[j];
best = j;
}
}
}
if best == current {
break;
}
current = best;
}
local_max[i] = current;
}
let merge_dist = if config.merge_distance > 0.0 {
F::from(config.merge_distance).unwrap_or_else(|| h)
} else {
h
};
let mut peak_map: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
let mut next_label = 0i32;
for i in 0..n {
if densities[i] < threshold {
peak_assignments[i] = -1;
continue;
}
let peak = local_max[i];
let mut merged_label = None;
for (&existing_peak, &label) in &peak_map {
if dist_sq(data.row(peak), data.row(existing_peak)).sqrt() <= merge_dist {
merged_label = Some(label);
break;
}
}
let label = match merged_label {
Some(l) => l,
None => {
let l = next_label;
peak_map.insert(peak, l);
peaks.push(peak);
next_label += 1;
l
}
};
peak_assignments[i] = label;
peak_map.entry(peak).or_insert(label);
}
let n_clusters = next_label as usize;
Ok(KdcResult {
labels: Array1::from_vec(peak_assignments),
densities,
bandwidth: h,
n_clusters,
})
}
fn silverman_bandwidth<F: Float + FromPrimitive + Debug>(data: ArrayView2<F>) -> F {
let (n, d) = (data.shape()[0], data.shape()[1]);
if n < 2 {
return F::one();
}
let mut total_std = 0.0f64;
for dim in 0..d {
let mean = (0..n)
.map(|i| data[[i, dim]].to_f64().unwrap_or(0.0))
.sum::<f64>()
/ n as f64;
let var = (0..n)
.map(|i| {
let diff = data[[i, dim]].to_f64().unwrap_or(0.0) - mean;
diff * diff
})
.sum::<f64>()
/ (n - 1) as f64;
total_std += var.sqrt();
}
let avg_std = total_std / d as f64;
let h = 1.06 * avg_std * (n as f64).powf(-0.2);
F::from(h.max(1e-10)).unwrap_or_else(|| F::one())
}
#[derive(Debug, Clone)]
pub struct LofConfig {
pub k: usize,
pub outlier_threshold: f64,
}
impl Default for LofConfig {
fn default() -> Self {
Self {
k: 5,
outlier_threshold: 1.5,
}
}
}
#[derive(Debug, Clone)]
pub struct LofResult<F: Float> {
pub lof_scores: Array1<F>,
pub is_outlier: Vec<bool>,
pub n_outliers: usize,
pub k_distances: Array1<F>,
pub lrd: Array1<F>,
}
pub fn local_outlier_factor<F: Float + FromPrimitive + Debug>(
data: ArrayView2<F>,
config: &LofConfig,
) -> Result<LofResult<F>> {
let n = data.shape()[0];
if n == 0 {
return Err(ClusteringError::InvalidInput("Empty input data".into()));
}
if config.k == 0 || config.k >= n {
return Err(ClusteringError::InvalidInput(
"k must be in [1, n_samples)".into(),
));
}
let dist_mat = pairwise_distances(data);
let k = config.k;
let knn = knn_distances(&dist_mat, k);
let mut k_dist = Array1::<F>::zeros(n);
for i in 0..n {
k_dist[i] = if knn[i].len() >= k {
knn[i][k - 1].1
} else if let Some(last) = knn[i].last() {
last.1
} else {
F::zero()
};
}
let mut lrd = Array1::<F>::zeros(n);
for i in 0..n {
let mut sum_reach = F::zero();
let nb_count = knn[i].len();
for &(j, d_ij) in &knn[i] {
let reach = d_ij.max(k_dist[j]);
sum_reach = sum_reach + reach;
}
if nb_count > 0 && sum_reach > F::epsilon() {
lrd[i] = F::from(nb_count).unwrap_or_else(|| F::one()) / sum_reach;
} else {
lrd[i] = F::one(); }
}
let mut lof_scores = Array1::<F>::zeros(n);
for i in 0..n {
let nb_count = knn[i].len();
if nb_count == 0 || lrd[i] <= F::epsilon() {
lof_scores[i] = F::one();
continue;
}
let mut sum = F::zero();
for &(j, _) in &knn[i] {
sum = sum + lrd[j] / lrd[i];
}
lof_scores[i] = sum / F::from(nb_count).unwrap_or_else(|| F::one());
}
let threshold = F::from(config.outlier_threshold)
.unwrap_or_else(|| F::from(1.5).unwrap_or_else(|| F::one()));
let is_outlier: Vec<bool> = lof_scores.iter().map(|&s| s > threshold).collect();
let n_outliers = is_outlier.iter().filter(|&&o| o).count();
Ok(LofResult {
lof_scores,
is_outlier,
n_outliers,
k_distances: k_dist,
lrd,
})
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array2;
fn make_clustered_data() -> Array2<f64> {
let mut data = Vec::new();
for i in 0..15 {
let n = (i as f64 * 0.073).sin() * 0.2;
data.push(1.0 + n);
data.push(1.0 + n * 0.5);
}
for i in 0..15 {
let n = (i as f64 * 0.131).sin() * 0.2;
data.push(5.0 + n);
data.push(5.0 + n * 0.5);
}
data.push(10.0);
data.push(10.0);
Array2::from_shape_vec((31, 2), data).expect("shape failed")
}
#[test]
fn test_hdbscan_star_basic() {
let data = make_clustered_data();
let config = HdbscanStarConfig {
min_cluster_size: 3,
min_samples: 3,
compute_probabilities: true,
};
let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
assert_eq!(result.labels.len(), 31);
assert!(result.n_clusters >= 1);
assert!(result.probabilities.is_some());
assert_eq!(result.outlier_scores.len(), 31);
}
#[test]
fn test_hdbscan_star_empty() {
let data = Array2::<f64>::zeros((0, 3));
let config = HdbscanStarConfig::default();
assert!(hdbscan_star(data.view(), &config).is_err());
}
#[test]
fn test_hdbscan_star_invalid_params() {
let data = make_clustered_data();
let config = HdbscanStarConfig {
min_cluster_size: 1,
..Default::default()
};
assert!(hdbscan_star(data.view(), &config).is_err());
}
#[test]
fn test_hdbscan_star_small_data() {
let data = Array2::from_shape_vec(
(5, 2),
vec![0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 5.0, 5.0, 5.1, 5.1],
)
.expect("shape");
let config = HdbscanStarConfig {
min_cluster_size: 2,
min_samples: 2,
compute_probabilities: false,
};
let result = hdbscan_star(data.view(), &config).expect("hdbscan* failed");
assert_eq!(result.labels.len(), 5);
assert!(result.probabilities.is_none());
}
#[test]
fn test_auto_epsilon_basic() {
let data = make_clustered_data();
let config = AutoEpsilonConfig {
k: 3,
min_samples: 3,
sensitivity: 1.0,
};
let result = auto_epsilon_dbscan(data.view(), &config).expect("auto-eps failed");
assert_eq!(result.labels.len(), 31);
assert!(result.epsilon > 0.0);
assert!(!result.k_distances.is_empty());
}
#[test]
fn test_auto_epsilon_empty() {
let data = Array2::<f64>::zeros((0, 2));
let config = AutoEpsilonConfig::default();
assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
}
#[test]
fn test_auto_epsilon_invalid_k() {
let data = make_clustered_data();
let config = AutoEpsilonConfig {
k: 0,
..Default::default()
};
assert!(auto_epsilon_dbscan(data.view(), &config).is_err());
}
#[test]
fn test_find_elbow() {
let values = vec![0.1, 0.2, 0.3, 0.4, 0.8, 1.5, 3.0, 5.0, 8.0, 12.0];
let idx = find_elbow(&values, 1.0);
assert!(idx > 0 && idx < values.len() - 1);
}
#[test]
fn test_snn_basic() {
let data = make_clustered_data();
let config = SnnConfig {
k: 5,
snn_threshold: 2,
min_snn_neighbors: 2,
};
let result = snn_clustering(data.view(), &config).expect("snn failed");
assert_eq!(result.labels.len(), 31);
let n = result.snn_similarity.shape()[0];
for i in 0..n {
for j in 0..n {
let diff = (result.snn_similarity[[i, j]] - result.snn_similarity[[j, i]]).abs();
assert!(diff < 1e-10);
}
}
}
#[test]
fn test_snn_empty() {
let data = Array2::<f64>::zeros((0, 2));
let config = SnnConfig::default();
assert!(snn_clustering(data.view(), &config).is_err());
}
#[test]
fn test_kde_clustering_basic() {
let data = make_clustered_data();
let config = KdcConfig {
kernel: KdeKernel::Gaussian,
bandwidth: 1.0,
density_threshold: 0.05,
merge_distance: 1.0,
};
let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
assert_eq!(result.labels.len(), 31);
assert_eq!(result.densities.len(), 31);
assert!(result.bandwidth > 0.0);
}
#[test]
fn test_kde_clustering_auto_bandwidth() {
let data = make_clustered_data();
let config = KdcConfig {
bandwidth: 0.0, ..Default::default()
};
let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
assert!(result.bandwidth > 0.0);
}
#[test]
fn test_kde_clustering_epanechnikov() {
let data = make_clustered_data();
let config = KdcConfig {
kernel: KdeKernel::Epanechnikov,
bandwidth: 2.0,
density_threshold: 0.01,
merge_distance: 1.0,
};
let result = kernel_density_clustering(data.view(), &config).expect("kde failed");
assert_eq!(result.labels.len(), 31);
}
#[test]
fn test_kde_clustering_empty() {
let data = Array2::<f64>::zeros((0, 2));
let config = KdcConfig::default();
assert!(kernel_density_clustering(data.view(), &config).is_err());
}
#[test]
fn test_silverman_bandwidth() {
let data = Array2::from_shape_vec(
(10, 2),
vec![
1.0, 2.0, 1.1, 2.1, 1.2, 1.9, 0.9, 2.2, 1.3, 1.8, 5.0, 6.0, 5.1, 6.1, 5.2, 5.9,
4.9, 6.2, 5.3, 5.8,
],
)
.expect("shape");
let h = silverman_bandwidth(data.view());
assert!(h > 0.0);
}
#[test]
fn test_lof_basic() {
let data = make_clustered_data();
let config = LofConfig {
k: 5,
outlier_threshold: 1.5,
};
let result = local_outlier_factor(data.view(), &config).expect("lof failed");
assert_eq!(result.lof_scores.len(), 31);
assert_eq!(result.is_outlier.len(), 31);
assert_eq!(result.k_distances.len(), 31);
assert_eq!(result.lrd.len(), 31);
let outlier_score = result.lof_scores[30];
assert!(
outlier_score > 1.0,
"Outlier LOF score should be > 1, got {}",
outlier_score
);
}
#[test]
fn test_lof_empty() {
let data = Array2::<f64>::zeros((0, 2));
let config = LofConfig::default();
assert!(local_outlier_factor(data.view(), &config).is_err());
}
#[test]
fn test_lof_invalid_k() {
let data = make_clustered_data();
let config = LofConfig {
k: 0,
..Default::default()
};
assert!(local_outlier_factor(data.view(), &config).is_err());
let config2 = LofConfig {
k: 100, ..Default::default()
};
assert!(local_outlier_factor(data.view(), &config2).is_err());
}
#[test]
fn test_lof_uniform_data() {
let data = Array2::from_shape_vec(
(10, 2),
vec![
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0,
],
)
.expect("shape");
let config = LofConfig {
k: 3,
outlier_threshold: 1.5,
};
let result = local_outlier_factor(data.view(), &config).expect("lof failed");
for &score in result.lof_scores.iter() {
assert!(
(score - 1.0).abs() < 0.5,
"LOF for identical points should be ~1, got {}",
score
);
}
}
#[test]
fn test_prims_mst() {
let mut dist = Array2::<f64>::zeros((4, 4));
dist[[0, 1]] = 1.0;
dist[[1, 0]] = 1.0;
dist[[0, 2]] = 4.0;
dist[[2, 0]] = 4.0;
dist[[0, 3]] = 3.0;
dist[[3, 0]] = 3.0;
dist[[1, 2]] = 2.0;
dist[[2, 1]] = 2.0;
dist[[1, 3]] = 5.0;
dist[[3, 1]] = 5.0;
dist[[2, 3]] = 1.0;
dist[[3, 2]] = 1.0;
let mst = prims_mst(&dist, 4);
assert_eq!(mst.len(), 3);
let total_weight: f64 = mst.iter().map(|e| e.weight).sum();
assert!((total_weight - 4.0).abs() < 1e-10); }
#[test]
fn test_dist_sq() {
let a = Array1::from_vec(vec![1.0, 2.0]);
let b = Array1::from_vec(vec![4.0, 6.0]);
let d = dist_sq(a.view(), b.view());
assert!((d - 25.0).abs() < 1e-10);
}
#[test]
fn test_knn_distances() {
let dist = Array2::from_shape_vec(
(4, 4),
vec![
0.0, 1.0, 3.0, 5.0, 1.0, 0.0, 2.0, 4.0, 3.0, 2.0, 0.0, 1.0, 5.0, 4.0, 1.0, 0.0,
],
)
.expect("shape");
let knn = knn_distances(&dist, 2);
assert_eq!(knn.len(), 4);
assert_eq!(knn[0][0].0, 1);
assert!((knn[0][0].1 - 1.0).abs() < 1e-10);
}
}