1#![allow(clippy::needless_range_loop)]
13#![warn(missing_docs)]
14
15pub mod clustering;
16pub mod graph;
17pub mod prelude;
18pub mod utils;
19
20use ann_search_rs::cpu::hnsw::{HnswIndex, HnswState};
21use ann_search_rs::cpu::nndescent::{ApplySortedUpdates, NNDescent, NNDescentQuery};
22use ann_search_rs::prelude::AnnSearchFloat;
23use faer::MatRef;
24use manifolds_rs::PreComputedKnn;
25use manifolds_rs::data::nearest_neighbours::*;
26use std::time::Instant;
27
28#[cfg(feature = "gpu")]
29use ann_search_rs::gpu::nndescent_gpu::NNDescentGpu;
30#[cfg(feature = "gpu")]
31use ann_search_rs::gpu::traits_gpu::AnnSearchGpuFloat;
32#[cfg(feature = "gpu")]
33use cubecl::prelude::*;
34#[cfg(feature = "gpu")]
35use manifolds_rs::data::nearest_neighbours_gpu::*;
36
37use crate::clustering::condensed_tree::*;
38use crate::clustering::linkage::mst_to_linkage_tree;
39use crate::clustering::mst::build_mst;
40use crate::clustering::persistence::build_cluster_layers;
41use crate::graph::embedding::*;
42use crate::graph::fuzzy_graph::*;
43use crate::prelude::*;
44
45#[derive(Clone, Debug)]
51pub struct EvocParams<T> {
52 pub n_neighbours: usize,
54 pub noise_level: T,
57 pub n_epochs: usize,
59 pub embedding_dim: Option<usize>,
62 pub neighbour_scale: T,
64 pub symmetrise: bool,
66 pub min_samples: usize,
68 pub base_min_cluster_size: usize,
70 pub approx_n_clusters: Option<usize>,
73 pub min_similarity_threshold: f64,
75 pub max_layers: usize,
77}
78
79impl<T: EvocFloat> Default for EvocParams<T> {
81 fn default() -> Self {
82 Self {
83 n_neighbours: 15,
84 noise_level: T::from(0.5).unwrap(),
85 n_epochs: 50,
86 embedding_dim: None,
87 neighbour_scale: T::one(),
88 symmetrise: true,
89 min_samples: 5,
90 base_min_cluster_size: 5,
91 approx_n_clusters: None,
92 min_similarity_threshold: 0.2,
93 max_layers: 10,
94 }
95 }
96}
97
98pub struct EvocResult<T> {
104 pub cluster_layers: Vec<Vec<i64>>,
107 pub membership_strengths: Vec<Vec<T>>,
109 pub persistence_scores: Vec<f64>,
111 pub nn_indices: Vec<Vec<usize>>,
113 pub nn_distances: Vec<Vec<T>>,
115}
116
117impl<T: EvocFloat> EvocResult<T> {
118 pub fn best_labels(&self) -> &[i64] {
122 if self.cluster_layers.len() <= 1 {
123 &self.cluster_layers[0]
124 } else {
125 let best = self
126 .persistence_scores
127 .iter()
128 .enumerate()
129 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
130 .map(|(i, _)| i)
131 .unwrap_or(0);
132 &self.cluster_layers[best]
133 }
134 }
135
136 pub fn best_strengths(&self) -> &[T] {
139 if self.membership_strengths.len() <= 1 {
140 &self.membership_strengths[0]
141 } else {
142 let best = self
143 .persistence_scores
144 .iter()
145 .enumerate()
146 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
147 .map(|(i, _)| i)
148 .unwrap_or(0);
149 &self.membership_strengths[best]
150 }
151 }
152
153 pub fn n_clusters(&self) -> usize {
155 let labels = self.best_labels();
156 (labels.iter().copied().reduce(i64::max).unwrap_or(-1) + 1).max(0) as usize
157 }
158}
159
160pub fn evoc<T>(
195 data: MatRef<T>,
196 ann_type: String,
197 precomputed_knn: PreComputedKnn<T>,
198 evoc_params: &EvocParams<T>,
199 nn_params: &NearestNeighbourParams<T>,
200 seed: usize,
201 verbose: bool,
202) -> EvocResult<T>
203where
204 T: EvocFloat + AnnSearchFloat,
205 NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
206 HnswIndex<T>: HnswState<T>,
207{
208 let start_all = Instant::now();
209
210 let (knn_indices, knn_dist) = match precomputed_knn {
212 Some((indices, distances)) => {
213 if verbose {
214 println!("Using precomputed kNN graph...");
215 }
216 (indices, distances)
217 }
218 None => {
219 if verbose {
220 println!(
221 "Running approximate nearest neighbour search using {}...",
222 ann_type
223 );
224 }
225 let start_knn = Instant::now();
226 let result = run_ann_search(
227 data,
228 evoc_params.n_neighbours,
229 ann_type,
230 nn_params,
231 seed,
232 verbose,
233 );
234 if verbose {
235 println!("kNN search done in {:.2?}.", start_knn.elapsed());
236 }
237 result
238 }
239 };
240
241 if verbose {
243 println!("Constructing fuzzy simplicial set...");
244 }
245 let start_graph = Instant::now();
246 let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
247 let graph =
248 build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
249 let adj = coo_to_adjacency_list(&graph);
250 if verbose {
251 println!(
252 "Fuzzy simplicial set done in {:.2?}.",
253 start_graph.elapsed()
254 );
255 }
256
257 let dim = evoc_params
260 .embedding_dim
261 .unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
262
263 let start_init = Instant::now();
265 let n = data.nrows();
266 let d = data.ncols();
267 let data_vecs: Vec<Vec<T>> = (0..n)
268 .map(|i| (0..d).map(|j| data[(i, j)]).collect())
269 .collect();
270
271 if verbose {
272 println!("Computing label propagation initialisation...");
273 }
274 let initial_embedding = crate::graph::label_prop::label_propagation_init(
275 &graph,
276 dim,
277 Some(&data_vecs),
278 seed as u64,
279 verbose,
280 );
281 if verbose {
282 println!("Label prop init done in {:.2?}.", start_init.elapsed());
283 }
284
285 if verbose {
287 println!(
288 "Computing {}-d node embedding ({} epochs)...",
289 dim, evoc_params.n_epochs
290 );
291 }
292 let start_embed = Instant::now();
293 let embed_params = EvocEmbeddingParams {
294 n_epochs: evoc_params.n_epochs,
295 noise_level: evoc_params.noise_level,
296 initial_alpha: T::from(0.1).unwrap(),
297 ..EvocEmbeddingParams::default()
298 };
299
300 let embedding = evoc_embedding(
301 &adj,
302 dim,
303 &embed_params,
304 Some(&initial_embedding),
305 seed as u64,
306 verbose,
307 );
308 if verbose {
309 println!("Embedding done in {:.2?}.", start_embed.elapsed());
310 }
311
312 if verbose {
314 println!("Running density-based clustering...");
315 }
316 let start_cluster = Instant::now();
317
318 let (cluster_layers, membership_strengths, persistence_scores) =
319 if let Some(target_k) = evoc_params.approx_n_clusters {
320 let (labels, strengths) =
321 search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
322 (vec![labels], vec![strengths], vec![0.0])
323 } else {
324 build_cluster_layers(
325 &embedding,
326 evoc_params.min_samples,
327 evoc_params.base_min_cluster_size,
328 evoc_params.min_similarity_threshold,
329 evoc_params.max_layers,
330 )
331 };
332
333 if verbose {
334 let n_layers = cluster_layers.len();
335 println!(
336 "Clustering done in {:.2?}: {} layer(s).",
337 start_cluster.elapsed(),
338 n_layers,
339 );
340 println!("EVoC total: {:.2?}.", start_all.elapsed());
341 }
342
343 EvocResult {
344 cluster_layers,
345 membership_strengths,
346 persistence_scores,
347 nn_indices: knn_indices,
348 nn_distances: knn_dist,
349 }
350}
351
352pub fn search_for_n_clusters<T>(
376 embedding: &[Vec<T>],
377 min_samples: usize,
378 target_k: usize,
379) -> (Vec<i64>, Vec<T>)
380where
381 T: EvocFloat,
382{
383 let n = embedding.len();
384 if n == 0 {
385 return (Vec::new(), Vec::new());
386 }
387
388 let mut mst = build_mst(embedding, min_samples);
389 let linkage = mst_to_linkage_tree(&mut mst, n);
390
391 let mut lo = 2usize;
392 let mut hi = n / 2;
393
394 while hi - lo > 1 {
395 let mid = (lo + hi) / 2;
396 if mid == lo || mid == hi {
397 break;
398 }
399
400 let ct_mid = condense_tree(&linkage, n, mid);
401 let leaves_mid = extract_leaves(&ct_mid);
402 let mid_k = leaves_mid.len();
403
404 if mid_k < target_k {
405 hi = mid;
407 } else {
408 lo = mid;
410 }
411 }
412
413 let ct_lo = condense_tree(&linkage, n, lo);
415 let leaves_lo = extract_leaves(&ct_lo);
416 let labels_lo = get_cluster_label_vector(&ct_lo, &leaves_lo, n);
417 let lo_k = leaves_lo.len();
418
419 let ct_hi = condense_tree(&linkage, n, hi);
420 let leaves_hi = extract_leaves(&ct_hi);
421 let labels_hi = get_cluster_label_vector(&ct_hi, &leaves_hi, n);
422 let hi_k = leaves_hi.len();
423
424 let lo_diff = (lo_k as isize - target_k as isize).unsigned_abs();
425 let hi_diff = (hi_k as isize - target_k as isize).unsigned_abs();
426
427 if lo_diff < hi_diff {
428 let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
429 (labels_lo, strengths)
430 } else if hi_diff < lo_diff {
431 let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
432 (labels_hi, strengths)
433 } else {
434 let lo_assigned = labels_lo.iter().filter(|&&l| l >= 0).count();
436 let hi_assigned = labels_hi.iter().filter(|&&l| l >= 0).count();
437 if lo_assigned >= hi_assigned {
438 let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
439 (labels_lo, strengths)
440 } else {
441 let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
442 (labels_hi, strengths)
443 }
444 }
445}
446
447#[allow(clippy::too_many_arguments)]
475#[cfg(feature = "gpu")]
476pub fn evoc_gpu<T, R>(
477 data: MatRef<T>,
478 ann_type: String,
479 precomputed_knn: PreComputedKnn<T>,
480 evoc_params: &EvocParams<T>,
481 nn_params: &NearestNeighbourParamsGpu<T>,
482 device: R::Device,
483 seed: usize,
484 verbose: bool,
485) -> EvocResult<T>
486where
487 T: EvocFloat + AnnSearchFloat + AnnSearchGpuFloat,
488 R: Runtime,
489 NNDescentGpu<T, R>: NNDescentQuery<T>,
490{
491 let start_all = Instant::now();
492
493 let (knn_indices, knn_dist) = match precomputed_knn {
495 Some((indices, distances)) => {
496 if verbose {
497 println!("Using precomputed kNN graph...");
498 }
499 (indices, distances)
500 }
501 None => {
502 if verbose {
503 println!("Running GPU nearest neighbour search using {}...", ann_type);
504 }
505 let start_knn = Instant::now();
506 let result = run_ann_search_gpu::<T, R>(
507 data,
508 evoc_params.n_neighbours,
509 ann_type,
510 nn_params,
511 device,
512 seed,
513 verbose,
514 );
515 if verbose {
516 println!("GPU kNN search done in {:.2?}.", start_knn.elapsed());
517 }
518 result
519 }
520 };
521
522 if verbose {
524 println!("Constructing fuzzy simplicial set...");
525 }
526 let start_graph = Instant::now();
527 let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
528 let graph =
529 build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
530 let adj = coo_to_adjacency_list(&graph);
531 if verbose {
532 println!(
533 "Fuzzy simplicial set done in {:.2?}.",
534 start_graph.elapsed()
535 );
536 }
537
538 let dim = evoc_params
540 .embedding_dim
541 .unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
542
543 let start_init = Instant::now();
545 let n = data.nrows();
546 let d = data.ncols();
547 let data_vecs: Vec<Vec<T>> = (0..n)
548 .map(|i| (0..d).map(|j| data[(i, j)]).collect())
549 .collect();
550
551 if verbose {
552 println!("Computing label propagation initialisation...");
553 }
554 let initial_embedding = crate::graph::label_prop::label_propagation_init(
555 &graph,
556 dim,
557 Some(&data_vecs),
558 seed as u64,
559 verbose,
560 );
561 if verbose {
562 println!("Label prop init done in {:.2?}.", start_init.elapsed());
563 }
564
565 if verbose {
567 println!(
568 "Computing {}-d node embedding ({} epochs)...",
569 dim, evoc_params.n_epochs
570 );
571 }
572 let start_embed = Instant::now();
573 let embed_params = EvocEmbeddingParams {
574 n_epochs: evoc_params.n_epochs,
575 noise_level: evoc_params.noise_level,
576 initial_alpha: T::from(0.1).unwrap(),
577 ..EvocEmbeddingParams::default()
578 };
579
580 let embedding = evoc_embedding(
581 &adj,
582 dim,
583 &embed_params,
584 Some(&initial_embedding),
585 seed as u64,
586 verbose,
587 );
588 if verbose {
589 println!("Embedding done in {:.2?}.", start_embed.elapsed());
590 }
591
592 if verbose {
594 println!("Running density-based clustering...");
595 }
596 let start_cluster = Instant::now();
597
598 let (cluster_layers, membership_strengths, persistence_scores) =
599 if let Some(target_k) = evoc_params.approx_n_clusters {
600 let (labels, strengths) =
601 search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
602 (vec![labels], vec![strengths], vec![0.0])
603 } else {
604 build_cluster_layers(
605 &embedding,
606 evoc_params.min_samples,
607 evoc_params.base_min_cluster_size,
608 evoc_params.min_similarity_threshold,
609 evoc_params.max_layers,
610 )
611 };
612
613 if verbose {
614 let n_layers = cluster_layers.len();
615 println!(
616 "Clustering done in {:.2?}: {} layer(s).",
617 start_cluster.elapsed(),
618 n_layers,
619 );
620 println!("EVoC (GPU) total: {:.2?}.", start_all.elapsed());
621 }
622
623 EvocResult {
624 cluster_layers,
625 membership_strengths,
626 persistence_scores,
627 nn_indices: knn_indices,
628 nn_distances: knn_dist,
629 }
630}