#![allow(clippy::module_name_repetitions)]
#![allow(clippy::too_many_lines)]
use super::distance::{DistanceMetric, SquaredEuclidean};
use super::flat::DataRef;
use super::util::{self, UnionFind};
use crate::error::{Error, Result};
use rand::prelude::*;
use std::collections::HashMap;
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct EVoCParams<D: DistanceMetric = SquaredEuclidean> {
pub intermediate_dim: usize,
pub min_cluster_size: usize,
pub max_layers: usize,
pub noise_level: f32,
pub duplicate_threshold: f32,
pub seed: Option<u64>,
pub metric: D,
}
impl Default for EVoCParams<SquaredEuclidean> {
fn default() -> Self {
Self {
intermediate_dim: 15,
min_cluster_size: 10,
max_layers: 10,
noise_level: 0.5,
duplicate_threshold: 1e-6,
seed: None,
metric: SquaredEuclidean,
}
}
}
#[derive(Clone, Debug)]
pub struct ClusterLayer {
pub threshold: f32,
pub assignments: Vec<Option<usize>>,
pub num_clusters: usize,
pub clusters: HashMap<usize, Vec<usize>>,
}
#[derive(Clone, Debug)]
pub struct ClusterNode {
pub id: usize,
pub children: Vec<usize>,
pub distance: f32,
pub size: usize,
}
#[derive(Clone, Debug)]
pub struct ClusterHierarchy {
nodes: Vec<ClusterNode>,
root: Option<usize>,
}
impl ClusterHierarchy {
pub fn from_mst(edges: &[(usize, usize, f32)], num_points: usize) -> Self {
let mut nodes: Vec<ClusterNode> =
Vec::with_capacity(num_points.saturating_mul(2).saturating_sub(1));
for i in 0..num_points {
nodes.push(ClusterNode {
id: i,
children: Vec::new(),
distance: 0.0,
size: 1,
});
}
if num_points == 0 {
return Self { nodes, root: None };
}
if num_points == 1 {
return Self {
nodes,
root: Some(0),
};
}
let mut uf = UnionFind::new(num_points);
let mut comp_node: Vec<usize> = (0..num_points).collect();
for &(u, v, dist) in edges {
let ru = uf.find(u);
let rv = uf.find(v);
if ru == rv {
continue;
}
let left = comp_node[ru];
let right = comp_node[rv];
let new_id = nodes.len();
let new_size = nodes[left].size + nodes[right].size;
nodes.push(ClusterNode {
id: new_id,
children: vec![left, right],
distance: dist,
size: new_size,
});
let new_root = uf.union_roots(ru, rv);
comp_node[new_root] = new_id;
}
let root = nodes.len().checked_sub(1);
Self { nodes, root }
}
pub fn root(&self) -> Option<usize> {
self.root
}
pub fn nodes(&self) -> &[ClusterNode] {
&self.nodes
}
pub fn get_all_distances(&self) -> Vec<f32> {
self.nodes
.iter()
.filter(|n| !n.children.is_empty())
.map(|n| n.distance)
.collect()
}
}
#[derive(Clone, Debug)]
pub struct EVoC<D: DistanceMetric = SquaredEuclidean> {
params: EVoCParams<D>,
original_dim: Option<usize>,
mst_edges: Vec<(usize, usize, f32)>,
hierarchy: Option<ClusterHierarchy>,
cluster_layers: Vec<ClusterLayer>,
duplicates: Vec<Vec<usize>>,
}
impl EVoC<SquaredEuclidean> {
pub fn new(params: EVoCParams<SquaredEuclidean>) -> Self {
Self {
params,
original_dim: None,
mst_edges: Vec::new(),
hierarchy: None,
cluster_layers: Vec::new(),
duplicates: Vec::new(),
}
}
}
impl<D: DistanceMetric> EVoC<D> {
pub fn with_metric(params: EVoCParams<D>) -> Self {
Self {
params,
original_dim: None,
mst_edges: Vec::new(),
hierarchy: None,
cluster_layers: Vec::new(),
duplicates: Vec::new(),
}
}
pub fn fit(&mut self, data: &(impl DataRef + ?Sized)) -> Result<()> {
self.fit_inner(data)?;
Ok(())
}
pub fn fit_predict(&mut self, data: &(impl DataRef + ?Sized)) -> Result<Vec<Option<usize>>> {
let n = data.n();
self.fit_inner(data)?;
Ok(self
.cluster_layers
.first()
.map_or_else(|| vec![None; n], |layer| layer.assignments.clone()))
}
fn fit_inner(&mut self, data: &(impl DataRef + ?Sized)) -> Result<()> {
if data.n() == 0 {
return Err(Error::EmptyInput);
}
let n = data.n();
let d = data.d();
if d == 0 {
return Err(Error::InvalidParameter {
name: "dimension",
message: "must be at least 1",
});
}
for i in 1..n {
if data.row(i).len() != d {
return Err(Error::DimensionMismatch {
expected: d,
found: data.row(i).len(),
});
}
}
self.original_dim = Some(d);
util::validate_finite(data)?;
if self.params.intermediate_dim == 0 {
return Err(Error::InvalidParameter {
name: "intermediate_dim",
message: "must be at least 1",
});
}
if self.params.intermediate_dim >= d {
return Err(Error::InvalidParameter {
name: "intermediate_dim",
message: "must be less than the original dimension",
});
}
let mut flat: Vec<f32> = Vec::with_capacity(n * d);
for i in 0..n {
flat.extend_from_slice(data.row(i));
}
let reduced = project(&flat, n, d, self.params.intermediate_dim, self.params.seed);
let dim = self.params.intermediate_dim;
let metric = &self.params.metric;
let mut mst = util::prim_mst(n, |i, j| {
let ivec = &reduced[i * dim..(i + 1) * dim];
let jvec = &reduced[j * dim..(j + 1) * dim];
metric.distance(ivec, jvec)
});
for edge in &mut mst {
edge.2 = edge.2.sqrt();
}
mst.sort_by(|a, b| a.2.total_cmp(&b.2));
self.mst_edges = mst;
self.hierarchy = Some(ClusterHierarchy::from_mst(&self.mst_edges, n));
self.cluster_layers = extract_layers(
n,
&self.mst_edges,
self.params.min_cluster_size.max(1),
self.params.max_layers.max(1),
self.params.noise_level,
);
self.duplicates = detect_duplicates(n, &self.mst_edges, self.params.duplicate_threshold);
Ok(())
}
pub fn cluster_layers(&self) -> &[ClusterLayer] {
&self.cluster_layers
}
pub fn cluster_tree(&self) -> Option<&ClusterHierarchy> {
self.hierarchy.as_ref()
}
pub fn duplicates(&self) -> &[Vec<usize>] {
&self.duplicates
}
pub fn mst_edges(&self) -> &[(usize, usize, f32)] {
&self.mst_edges
}
pub fn original_dim(&self) -> Option<usize> {
self.original_dim
}
pub fn finest_layer(&self) -> Option<&ClusterLayer> {
self.cluster_layers.first()
}
pub fn coarsest_layer(&self) -> Option<&ClusterLayer> {
self.cluster_layers.last()
}
pub fn labels_finest(&self) -> Option<&[Option<usize>]> {
self.finest_layer().map(|l| l.assignments.as_slice())
}
pub fn labels_coarsest(&self) -> Option<&[Option<usize>]> {
self.coarsest_layer().map(|l| l.assignments.as_slice())
}
pub fn layer_at_threshold(&self, threshold: f32) -> Result<ClusterLayer> {
if !threshold.is_finite() || threshold < 0.0 {
return Err(Error::InvalidParameter {
name: "threshold",
message: "must be a finite, non-negative number",
});
}
let n = self
.cluster_layers
.first()
.map(|l| l.assignments.len())
.ok_or_else(|| Error::Other("EVoC is not fitted".to_string()))?;
Ok(layer_at_threshold(
n,
&self.mst_edges,
threshold,
self.params.min_cluster_size.max(1),
))
}
pub fn layer_for_n_clusters(&self, target_clusters: usize) -> Result<ClusterLayer> {
if target_clusters == 0 {
return Err(Error::InvalidParameter {
name: "target_clusters",
message: "must be at least 1",
});
}
let n = self
.cluster_layers
.first()
.map(|l| l.assignments.len())
.ok_or_else(|| Error::Other("EVoC is not fitted".to_string()))?;
let thr = best_threshold_for_n_clusters(
n,
&self.mst_edges,
self.params.min_cluster_size.max(1),
target_clusters,
);
Ok(layer_at_threshold(
n,
&self.mst_edges,
thr,
self.params.min_cluster_size.max(1),
))
}
}
fn project(
vectors: &[f32],
num_vectors: usize,
original_dim: usize,
intermediate_dim: usize,
seed: Option<u64>,
) -> Vec<f32> {
let mut rng = match seed {
Some(s) => StdRng::seed_from_u64(s),
None => StdRng::from_os_rng(),
};
let mut mat: Vec<f32> = Vec::with_capacity(intermediate_dim * original_dim);
for _ in 0..intermediate_dim {
let start = mat.len();
for _ in 0..original_dim {
mat.push(rng.random::<f32>() * 2.0 - 1.0);
}
normalize_in_place(&mut mat[start..start + original_dim]);
}
let mut out: Vec<f32> = Vec::with_capacity(num_vectors * intermediate_dim);
for i in 0..num_vectors {
let v = &vectors[i * original_dim..(i + 1) * original_dim];
for j in 0..intermediate_dim {
let row = &mat[j * original_dim..(j + 1) * original_dim];
out.push(dot(v, row));
}
}
out
}
fn normalize_in_place(v: &mut [f32]) {
let norm = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > f32::EPSILON {
for x in v {
*x /= norm;
}
}
}
#[inline]
fn dot(a: &[f32], b: &[f32]) -> f32 {
debug_assert_eq!(a.len(), b.len());
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
fn extract_layers(
n: usize,
mst_edges: &[(usize, usize, f32)],
min_cluster_size: usize,
max_layers: usize,
noise_level: f32,
) -> Vec<ClusterLayer> {
if n == 0 {
return Vec::new();
}
if mst_edges.is_empty() {
let (assignments, clusters, num_clusters) = if min_cluster_size <= 1 {
(
vec![Some(0)],
HashMap::from([(0usize, vec![0usize])]),
1usize,
)
} else {
(vec![None], HashMap::new(), 0usize)
};
return vec![ClusterLayer {
threshold: 0.0,
assignments,
num_clusters,
clusters,
}];
}
let mut dists: Vec<f32> = mst_edges.iter().map(|e| e.2).collect();
dists.sort_by(|a, b| a.total_cmp(b));
let layers = max_layers.min(dists.len()).max(1);
let bias = noise_level.clamp(0.0, 1.0);
let mut thresholds: Vec<f32> = Vec::with_capacity(layers);
if layers == 1 {
thresholds.push(dists[dists.len() - 1]);
} else {
for i in 0..layers {
let t = (i as f32) / ((layers - 1) as f32);
let t = t.powf(1.0 - bias + 1e-6);
let idx = (t * ((dists.len() - 1) as f32)).round() as usize;
thresholds.push(dists[idx.min(dists.len() - 1)]);
}
}
thresholds.sort_by(|a, b| a.total_cmp(b));
thresholds.dedup_by(|a, b| (*a - *b).abs() <= f32::EPSILON);
let mut layers_out: Vec<ClusterLayer> = thresholds
.into_iter()
.map(|thr| layer_at_threshold(n, mst_edges, thr, min_cluster_size))
.collect();
layers_out.sort_by_key(|b| std::cmp::Reverse(b.num_clusters));
layers_out
}
fn best_threshold_for_n_clusters(
n: usize,
mst_edges: &[(usize, usize, f32)],
min_cluster_size: usize,
target_clusters: usize,
) -> f32 {
if n <= 1 {
return 0.0;
}
let mut edges = mst_edges.to_vec();
edges.sort_by(|a, b| a.2.total_cmp(&b.2));
let mut uf = UnionFind::new(n);
let mut clusters = if min_cluster_size <= 1 { n } else { 0usize };
let mut assigned = if min_cluster_size <= 1 { n } else { 0usize };
let mut best_thr = 0.0f32;
let mut best_clusters = clusters;
let mut best_assigned = assigned;
let mut best_diff = best_clusters.abs_diff(target_clusters);
for &(u, v, d) in &edges {
let ru = uf.find(u);
let rv = uf.find(v);
if ru == rv {
continue;
}
let (ru_sz, rv_sz) = (uf.size[ru], uf.size[rv]);
let before_clusters =
usize::from(ru_sz >= min_cluster_size) + usize::from(rv_sz >= min_cluster_size);
let before_assigned = if ru_sz >= min_cluster_size { ru_sz } else { 0 }
+ if rv_sz >= min_cluster_size { rv_sz } else { 0 };
let new_root = uf.union_roots(ru, rv);
let new_sz = uf.size[new_root];
let after_clusters = usize::from(new_sz >= min_cluster_size);
let after_assigned = if new_sz >= min_cluster_size {
new_sz
} else {
0
};
clusters = clusters + after_clusters - before_clusters;
assigned = assigned + after_assigned - before_assigned;
let diff = clusters.abs_diff(target_clusters);
if diff < best_diff
|| (diff == best_diff && assigned > best_assigned)
|| (diff == best_diff && assigned == best_assigned && clusters > best_clusters)
{
best_diff = diff;
best_clusters = clusters;
best_assigned = assigned;
best_thr = d;
if best_diff == 0 && best_assigned == n {
break;
}
}
}
best_thr
}
fn layer_at_threshold(
n: usize,
mst_edges: &[(usize, usize, f32)],
threshold: f32,
min_cluster_size: usize,
) -> ClusterLayer {
let mut uf = UnionFind::new(n);
for &(u, v, d) in mst_edges {
if d <= threshold {
uf.union(u, v);
} else {
break;
}
}
let mut roots: Vec<usize> = Vec::with_capacity(n);
for i in 0..n {
roots.push(uf.find(i));
}
let mut counts: HashMap<usize, usize> = HashMap::new();
for &r in &roots {
*counts.entry(r).or_insert(0) += 1;
}
let mut cluster_roots: Vec<(usize, usize)> = counts
.iter()
.filter_map(|(&root, &count)| (count >= min_cluster_size).then_some((root, count)))
.collect();
cluster_roots.sort_by_key(|(root, _count)| *root);
let mut root_to_cluster: HashMap<usize, usize> = HashMap::with_capacity(cluster_roots.len());
let mut clusters: HashMap<usize, Vec<usize>> = HashMap::with_capacity(cluster_roots.len());
for (cid, (root, count)) in cluster_roots.into_iter().enumerate() {
root_to_cluster.insert(root, cid);
clusters.insert(cid, Vec::with_capacity(count));
}
let mut assignments: Vec<Option<usize>> = vec![None; n];
for i in 0..n {
if let Some(&cid) = root_to_cluster.get(&roots[i]) {
assignments[i] = Some(cid);
clusters
.get_mut(&cid)
.expect("cluster vec must exist")
.push(i);
}
}
ClusterLayer {
threshold,
assignments,
num_clusters: root_to_cluster.len(),
clusters,
}
}
fn detect_duplicates(
n: usize,
mst_edges: &[(usize, usize, f32)],
duplicate_threshold: f32,
) -> Vec<Vec<usize>> {
if n == 0 {
return Vec::new();
}
if duplicate_threshold <= 0.0 {
return Vec::new();
}
let mut uf = UnionFind::new(n);
for &(u, v, d) in mst_edges {
if d <= duplicate_threshold {
uf.union(u, v);
} else {
break;
}
}
let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..n {
let r = uf.find(i);
groups.entry(r).or_default().push(i);
}
let mut out: Vec<Vec<usize>> = groups.into_values().filter(|g| g.len() > 1).collect();
out.sort_by(|a, b| a[0].cmp(&b[0]));
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn evoc_smoke_two_clusters() {
let data = vec![
vec![0.0, 0.0],
vec![0.1, 0.1],
vec![0.2, 0.0],
vec![10.0, 10.0],
vec![10.1, 10.1],
vec![9.9, 10.2],
];
let params = EVoCParams {
intermediate_dim: 1,
min_cluster_size: 2,
max_layers: 8,
noise_level: 0.2,
duplicate_threshold: 1e-6,
seed: Some(42),
..Default::default()
};
let mut evoc = EVoC::new(params);
let labels = evoc.fit_predict(&data).unwrap();
assert_eq!(labels.len(), data.len());
assert!(!evoc.cluster_layers().is_empty());
assert!(evoc.cluster_tree().is_some());
assert!(evoc.duplicates().is_empty());
assert!(
evoc.cluster_layers().iter().any(|l| l.num_clusters >= 2),
"expected at least one multi-cluster layer"
);
}
#[test]
fn evoc_detects_duplicates() {
let data = vec![vec![1.0, 0.0], vec![1.0, 0.0], vec![0.0, 1.0]];
let mut evoc = EVoC::new(EVoCParams {
intermediate_dim: 1,
min_cluster_size: 1,
max_layers: 4,
noise_level: 0.5,
duplicate_threshold: 1e-6,
seed: Some(7),
..Default::default()
});
let _ = evoc.fit_predict(&data).unwrap();
let dups = evoc.duplicates();
assert!(
dups.iter()
.any(|g| g.len() == 2 && g.contains(&0) && g.contains(&1)),
"expected points 0 and 1 to be flagged as duplicates"
);
}
#[test]
fn evoc_layer_helpers() {
let d = 16usize;
let p0 = vec![0.0f32; d];
let mut p1 = vec![0.0f32; d];
p1[0] = 0.1;
let mut p2 = vec![0.0f32; d];
p2[1] = 0.1;
let q0 = vec![1000.0f32; d];
let mut q1 = vec![1000.0f32; d];
q1[0] = 1000.1;
let mut q2 = vec![1000.0f32; d];
q2[1] = 1000.1;
let data = vec![p0, p1, p2, q0, q1, q2];
let mut evoc = EVoC::new(EVoCParams {
intermediate_dim: 15,
min_cluster_size: 2,
max_layers: 8,
noise_level: 0.2,
duplicate_threshold: 1e-6,
seed: Some(42),
..Default::default()
});
evoc.fit(&data).unwrap();
assert!(evoc.finest_layer().is_some());
assert!(evoc.coarsest_layer().is_some());
assert_eq!(evoc.labels_finest().unwrap().len(), data.len());
assert_eq!(evoc.labels_coarsest().unwrap().len(), data.len());
let layer0 = evoc.layer_at_threshold(0.0).unwrap();
assert_eq!(layer0.num_clusters, 0);
assert!(layer0.assignments.iter().all(|a| a.is_none()));
let layer2 = evoc.layer_for_n_clusters(2).unwrap();
assert_eq!(layer2.num_clusters, 2);
let a0 = layer2.assignments[0].expect("point 0 should not be noise");
let a1 = layer2.assignments[1].expect("point 1 should not be noise");
let a2 = layer2.assignments[2].expect("point 2 should not be noise");
let b0 = layer2.assignments[3].expect("point 3 should not be noise");
let b1 = layer2.assignments[4].expect("point 4 should not be noise");
let b2 = layer2.assignments[5].expect("point 5 should not be noise");
assert_eq!(a0, a1);
assert_eq!(a1, a2);
assert_eq!(b0, b1);
assert_eq!(b1, b2);
assert_ne!(a0, b0);
}
#[test]
fn nan_input_rejected() {
let data = vec![vec![0.0, f32::NAN], vec![1.0, 1.0], vec![2.0, 2.0]];
let mut evoc = EVoC::new(EVoCParams {
intermediate_dim: 1,
min_cluster_size: 1,
..Default::default()
});
let result = evoc.fit_predict(&data);
assert!(result.is_err());
}
#[test]
fn inf_input_rejected() {
let data = vec![vec![0.0, 0.0], vec![f32::INFINITY, 1.0], vec![2.0, 2.0]];
let mut evoc = EVoC::new(EVoCParams {
intermediate_dim: 1,
min_cluster_size: 1,
..Default::default()
});
let result = evoc.fit_predict(&data);
assert!(result.is_err());
}
mod proptests {
use super::*;
use proptest::prelude::*;
fn arb_data(max_n: usize, d: usize) -> impl Strategy<Value = Vec<Vec<f32>>> {
proptest::collection::vec(proptest::collection::vec(-10.0f32..10.0, d..=d), 4..=max_n)
}
fn evoc_params() -> EVoCParams {
EVoCParams {
intermediate_dim: 1,
min_cluster_size: 2,
seed: Some(42),
..Default::default()
}
}
proptest! {
#[test]
fn labels_valid(data in arb_data(12, 2)) {
let n = data.len();
let mut evoc = EVoC::new(evoc_params());
let labels = evoc.fit_predict(&data).unwrap();
for (i, label) in labels.iter().enumerate() {
if let Some(cid) = label {
prop_assert!(*cid < n,
"point {} has cluster_id {} >= n={}", i, cid, n);
}
}
}
#[test]
fn label_count_matches_data(data in arb_data(12, 3)) {
let n = data.len();
let mut evoc = EVoC::new(evoc_params());
let labels = evoc.fit_predict(&data).unwrap();
prop_assert_eq!(labels.len(), n,
"expected {} labels, got {}", n, labels.len());
}
#[test]
fn hierarchy_layers_valid(data in arb_data(12, 2)) {
let n = data.len();
let mut evoc = EVoC::new(evoc_params());
evoc.fit(&data).unwrap();
for (li, layer) in evoc.cluster_layers().iter().enumerate() {
prop_assert_eq!(layer.assignments.len(), n,
"layer {} assignments length {} != n={}", li, layer.assignments.len(), n);
for (i, label) in layer.assignments.iter().enumerate() {
if let Some(cid) = label {
prop_assert!(*cid < layer.num_clusters,
"layer {} point {} has cluster_id {} >= num_clusters={}",
li, i, cid, layer.num_clusters);
}
}
}
}
#[test]
fn all_identical_no_crash(
n in 4usize..12,
x in -10.0f32..10.0,
y in -10.0f32..10.0,
) {
let point = vec![x, y];
let data = vec![point; n];
let mut evoc = EVoC::new(evoc_params());
let result = evoc.fit_predict(&data);
prop_assert!(result.is_ok(), "fit_predict panicked or errored on identical points");
}
}
}
}