#![allow(clippy::needless_range_loop)]
#![warn(missing_docs)]
pub mod clustering;
pub mod graph;
pub mod prelude;
pub mod utils;
use ann_search_rs::cpu::hnsw::{HnswIndex, HnswState};
use ann_search_rs::cpu::nndescent::{ApplySortedUpdates, NNDescent, NNDescentQuery};
use ann_search_rs::prelude::AnnSearchFloat;
use faer::MatRef;
use manifolds_rs::PreComputedKnn;
use manifolds_rs::data::nearest_neighbours::*;
use std::time::Instant;
#[cfg(feature = "gpu")]
use ann_search_rs::gpu::nndescent_gpu::NNDescentGpu;
#[cfg(feature = "gpu")]
use ann_search_rs::gpu::traits_gpu::AnnSearchGpuFloat;
#[cfg(feature = "gpu")]
use cubecl::prelude::*;
#[cfg(feature = "gpu")]
use manifolds_rs::data::nearest_neighbours_gpu::*;
use crate::clustering::condensed_tree::*;
use crate::clustering::linkage::mst_to_linkage_tree;
use crate::clustering::mst::build_mst;
use crate::clustering::persistence::build_cluster_layers;
use crate::graph::embedding::*;
use crate::graph::fuzzy_graph::*;
use crate::prelude::*;
#[derive(Clone, Debug)]
pub struct EvocParams<T> {
pub n_neighbours: usize,
pub noise_level: T,
pub n_epochs: usize,
pub embedding_dim: Option<usize>,
pub neighbour_scale: T,
pub symmetrise: bool,
pub min_samples: usize,
pub base_min_cluster_size: usize,
pub approx_n_clusters: Option<usize>,
pub min_similarity_threshold: f64,
pub max_layers: usize,
}
impl<T: EvocFloat> Default for EvocParams<T> {
fn default() -> Self {
Self {
n_neighbours: 15,
noise_level: T::from(0.5).unwrap(),
n_epochs: 50,
embedding_dim: None,
neighbour_scale: T::one(),
symmetrise: true,
min_samples: 5,
base_min_cluster_size: 5,
approx_n_clusters: None,
min_similarity_threshold: 0.2,
max_layers: 10,
}
}
}
pub struct EvocResult<T> {
pub cluster_layers: Vec<Vec<i64>>,
pub membership_strengths: Vec<Vec<T>>,
pub persistence_scores: Vec<f64>,
pub nn_indices: Vec<Vec<usize>>,
pub nn_distances: Vec<Vec<T>>,
}
impl<T: EvocFloat> EvocResult<T> {
pub fn best_labels(&self) -> &[i64] {
if self.cluster_layers.len() <= 1 {
&self.cluster_layers[0]
} else {
let best = self
.persistence_scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.map(|(i, _)| i)
.unwrap_or(0);
&self.cluster_layers[best]
}
}
pub fn best_strengths(&self) -> &[T] {
if self.membership_strengths.len() <= 1 {
&self.membership_strengths[0]
} else {
let best = self
.persistence_scores
.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.map(|(i, _)| i)
.unwrap_or(0);
&self.membership_strengths[best]
}
}
pub fn n_clusters(&self) -> usize {
let labels = self.best_labels();
(labels.iter().copied().reduce(i64::max).unwrap_or(-1) + 1).max(0) as usize
}
}
pub fn evoc<T>(
data: MatRef<T>,
ann_type: String,
precomputed_knn: PreComputedKnn<T>,
evoc_params: &EvocParams<T>,
nn_params: &NearestNeighbourParams<T>,
seed: usize,
verbose: bool,
) -> EvocResult<T>
where
T: EvocFloat + AnnSearchFloat,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
HnswIndex<T>: HnswState<T>,
{
let start_all = Instant::now();
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!(
"Running approximate nearest neighbour search using {}...",
ann_type
);
}
let start_knn = Instant::now();
let result = run_ann_search(
data,
evoc_params.n_neighbours,
ann_type,
nn_params,
seed,
verbose,
);
if verbose {
println!("kNN search done in {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Constructing fuzzy simplicial set...");
}
let start_graph = Instant::now();
let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
let graph =
build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
let adj = coo_to_adjacency_list(&graph);
if verbose {
println!(
"Fuzzy simplicial set done in {:.2?}.",
start_graph.elapsed()
);
}
let dim = evoc_params
.embedding_dim
.unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
let start_init = Instant::now();
let n = data.nrows();
let d = data.ncols();
let data_vecs: Vec<Vec<T>> = (0..n)
.map(|i| (0..d).map(|j| data[(i, j)]).collect())
.collect();
if verbose {
println!("Computing label propagation initialisation...");
}
let initial_embedding = crate::graph::label_prop::label_propagation_init(
&graph,
dim,
Some(&data_vecs),
seed as u64,
verbose,
);
if verbose {
println!("Label prop init done in {:.2?}.", start_init.elapsed());
}
if verbose {
println!(
"Computing {}-d node embedding ({} epochs)...",
dim, evoc_params.n_epochs
);
}
let start_embed = Instant::now();
let embed_params = EvocEmbeddingParams {
n_epochs: evoc_params.n_epochs,
noise_level: evoc_params.noise_level,
initial_alpha: T::from(0.1).unwrap(),
..EvocEmbeddingParams::default()
};
let embedding = evoc_embedding(
&adj,
dim,
&embed_params,
Some(&initial_embedding),
seed as u64,
verbose,
);
if verbose {
println!("Embedding done in {:.2?}.", start_embed.elapsed());
}
if verbose {
println!("Running density-based clustering...");
}
let start_cluster = Instant::now();
let (cluster_layers, membership_strengths, persistence_scores) =
if let Some(target_k) = evoc_params.approx_n_clusters {
let (labels, strengths) =
search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
(vec![labels], vec![strengths], vec![0.0])
} else {
build_cluster_layers(
&embedding,
evoc_params.min_samples,
evoc_params.base_min_cluster_size,
evoc_params.min_similarity_threshold,
evoc_params.max_layers,
)
};
if verbose {
let n_layers = cluster_layers.len();
println!(
"Clustering done in {:.2?}: {} layer(s).",
start_cluster.elapsed(),
n_layers,
);
println!("EVoC total: {:.2?}.", start_all.elapsed());
}
EvocResult {
cluster_layers,
membership_strengths,
persistence_scores,
nn_indices: knn_indices,
nn_distances: knn_dist,
}
}
pub fn search_for_n_clusters<T>(
embedding: &[Vec<T>],
min_samples: usize,
target_k: usize,
) -> (Vec<i64>, Vec<T>)
where
T: EvocFloat,
{
let n = embedding.len();
if n == 0 {
return (Vec::new(), Vec::new());
}
let mut mst = build_mst(embedding, min_samples);
let linkage = mst_to_linkage_tree(&mut mst, n);
let mut lo = 2usize;
let mut hi = n / 2;
while hi - lo > 1 {
let mid = (lo + hi) / 2;
if mid == lo || mid == hi {
break;
}
let ct_mid = condense_tree(&linkage, n, mid);
let leaves_mid = extract_leaves(&ct_mid);
let mid_k = leaves_mid.len();
if mid_k < target_k {
hi = mid;
} else {
lo = mid;
}
}
let ct_lo = condense_tree(&linkage, n, lo);
let leaves_lo = extract_leaves(&ct_lo);
let labels_lo = get_cluster_label_vector(&ct_lo, &leaves_lo, n);
let lo_k = leaves_lo.len();
let ct_hi = condense_tree(&linkage, n, hi);
let leaves_hi = extract_leaves(&ct_hi);
let labels_hi = get_cluster_label_vector(&ct_hi, &leaves_hi, n);
let hi_k = leaves_hi.len();
let lo_diff = (lo_k as isize - target_k as isize).unsigned_abs();
let hi_diff = (hi_k as isize - target_k as isize).unsigned_abs();
if lo_diff < hi_diff {
let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
(labels_lo, strengths)
} else if hi_diff < lo_diff {
let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
(labels_hi, strengths)
} else {
let lo_assigned = labels_lo.iter().filter(|&&l| l >= 0).count();
let hi_assigned = labels_hi.iter().filter(|&&l| l >= 0).count();
if lo_assigned >= hi_assigned {
let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
(labels_lo, strengths)
} else {
let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
(labels_hi, strengths)
}
}
}
#[allow(clippy::too_many_arguments)]
#[cfg(feature = "gpu")]
pub fn evoc_gpu<T, R>(
data: MatRef<T>,
ann_type: String,
precomputed_knn: PreComputedKnn<T>,
evoc_params: &EvocParams<T>,
nn_params: &NearestNeighbourParamsGpu<T>,
device: R::Device,
seed: usize,
verbose: bool,
) -> EvocResult<T>
where
T: EvocFloat + AnnSearchFloat + AnnSearchGpuFloat,
R: Runtime,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
let start_all = Instant::now();
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!("Running GPU nearest neighbour search using {}...", ann_type);
}
let start_knn = Instant::now();
let result = run_ann_search_gpu::<T, R>(
data,
evoc_params.n_neighbours,
ann_type,
nn_params,
device,
seed,
verbose,
);
if verbose {
println!("GPU kNN search done in {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Constructing fuzzy simplicial set...");
}
let start_graph = Instant::now();
let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
let graph =
build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
let adj = coo_to_adjacency_list(&graph);
if verbose {
println!(
"Fuzzy simplicial set done in {:.2?}.",
start_graph.elapsed()
);
}
let dim = evoc_params
.embedding_dim
.unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
let start_init = Instant::now();
let n = data.nrows();
let d = data.ncols();
let data_vecs: Vec<Vec<T>> = (0..n)
.map(|i| (0..d).map(|j| data[(i, j)]).collect())
.collect();
if verbose {
println!("Computing label propagation initialisation...");
}
let initial_embedding = crate::graph::label_prop::label_propagation_init(
&graph,
dim,
Some(&data_vecs),
seed as u64,
verbose,
);
if verbose {
println!("Label prop init done in {:.2?}.", start_init.elapsed());
}
if verbose {
println!(
"Computing {}-d node embedding ({} epochs)...",
dim, evoc_params.n_epochs
);
}
let start_embed = Instant::now();
let embed_params = EvocEmbeddingParams {
n_epochs: evoc_params.n_epochs,
noise_level: evoc_params.noise_level,
initial_alpha: T::from(0.1).unwrap(),
..EvocEmbeddingParams::default()
};
let embedding = evoc_embedding(
&adj,
dim,
&embed_params,
Some(&initial_embedding),
seed as u64,
verbose,
);
if verbose {
println!("Embedding done in {:.2?}.", start_embed.elapsed());
}
if verbose {
println!("Running density-based clustering...");
}
let start_cluster = Instant::now();
let (cluster_layers, membership_strengths, persistence_scores) =
if let Some(target_k) = evoc_params.approx_n_clusters {
let (labels, strengths) =
search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
(vec![labels], vec![strengths], vec![0.0])
} else {
build_cluster_layers(
&embedding,
evoc_params.min_samples,
evoc_params.base_min_cluster_size,
evoc_params.min_similarity_threshold,
evoc_params.max_layers,
)
};
if verbose {
let n_layers = cluster_layers.len();
println!(
"Clustering done in {:.2?}: {} layer(s).",
start_cluster.elapsed(),
n_layers,
);
println!("EVoC (GPU) total: {:.2?}.", start_all.elapsed());
}
EvocResult {
cluster_layers,
membership_strengths,
persistence_scores,
nn_indices: knn_indices,
nn_distances: knn_dist,
}
}