Skip to main content

evoc_rs/
lib.rs

1//! EVoC - Embedding Vector Oriented Clustering
2//!
3//! Efficient clustering of high-dimensional embedding vectors (CLIP, sentence
4//! transformers, etc.) by combining a UMAP-like node embedding with
5//! HDBSCAN-style density-based clustering and multi-layer persistence
6//! analysis. This is the Rust version/port which allows for different
7//! approximate nearest neighbour search algorithms (for details, see
8//! [ann-search-rs](https://crates.io/crates/ann-search-rs)).
9//! This code is based on the original code from Leland McInnes, see the Python
10//! implementation: [evoc](https://github.com/TutteInstitute/evoc)
11
12#![allow(clippy::needless_range_loop)]
13#![warn(missing_docs)]
14
15pub mod clustering;
16pub mod graph;
17pub mod prelude;
18pub mod utils;
19
20use ann_search_rs::cpu::hnsw::{HnswIndex, HnswState};
21use ann_search_rs::cpu::nndescent::{ApplySortedUpdates, NNDescent, NNDescentQuery};
22use ann_search_rs::prelude::AnnSearchFloat;
23use faer::MatRef;
24use manifolds_rs::PreComputedKnn;
25use manifolds_rs::data::nearest_neighbours::*;
26use std::time::Instant;
27
28#[cfg(feature = "gpu")]
29use ann_search_rs::gpu::nndescent_gpu::NNDescentGpu;
30#[cfg(feature = "gpu")]
31use ann_search_rs::gpu::traits_gpu::AnnSearchGpuFloat;
32#[cfg(feature = "gpu")]
33use cubecl::prelude::*;
34#[cfg(feature = "gpu")]
35use manifolds_rs::data::nearest_neighbours_gpu::*;
36
37use crate::clustering::condensed_tree::*;
38use crate::clustering::linkage::mst_to_linkage_tree;
39use crate::clustering::mst::build_mst;
40use crate::clustering::persistence::build_cluster_layers;
41use crate::graph::embedding::*;
42use crate::graph::fuzzy_graph::*;
43use crate::prelude::*;
44
45////////////
46// Params //
47////////////
48
49/// Parameters for EVoC clustering.
50#[derive(Clone, Debug)]
51pub struct EvocParams<T> {
52    /// Number of nearest neighbours for graph construction.
53    pub n_neighbours: usize,
54    /// Noise level for the embedding gradient (0.0 = aggressive, 1.0 =
55    /// conservative).
56    pub noise_level: T,
57    /// Number of embedding optimisation epochs.
58    pub n_epochs: usize,
59    /// Embedding dimensionality. If `None`, defaults to
60    /// `min(max(n_neighbours / 4, 4), 16)`.
61    pub embedding_dim: Option<usize>,
62    /// Multiplier on effective neighbours for fuzzy graph construction.
63    pub neighbour_scale: T,
64    /// Whether to symmetrise the fuzzy graph.
65    pub symmetrise: bool,
66    /// Minimum samples for core distance in MST density estimation.
67    pub min_samples: usize,
68    /// Base minimum cluster size for the finest layer.
69    pub base_min_cluster_size: usize,
70    /// If set, binary-search for approximately this many clusters (single layer
71    /// output).
72    pub approx_n_clusters: Option<usize>,
73    /// Jaccard similarity threshold for filtering redundant layers.
74    pub min_similarity_threshold: f64,
75    /// Maximum number of cluster layers to return.
76    pub max_layers: usize,
77}
78
79/// Default implementation
80impl<T: EvocFloat> Default for EvocParams<T> {
81    fn default() -> Self {
82        Self {
83            n_neighbours: 15,
84            noise_level: T::from(0.5).unwrap(),
85            n_epochs: 50,
86            embedding_dim: None,
87            neighbour_scale: T::one(),
88            symmetrise: true,
89            min_samples: 5,
90            base_min_cluster_size: 5,
91            approx_n_clusters: None,
92            min_similarity_threshold: 0.2,
93            max_layers: 10,
94        }
95    }
96}
97
98/////////////
99// Results //
100/////////////
101
102/// Result of EVoC clustering.
103pub struct EvocResult<T> {
104    /// Cluster labels per layer, sorted finest (most clusters) first.
105    /// -1 indicates noise.
106    pub cluster_layers: Vec<Vec<i64>>,
107    /// Membership strengths per layer, in [0, 1].
108    pub membership_strengths: Vec<Vec<T>>,
109    /// Persistence score per layer (higher = more stable).
110    pub persistence_scores: Vec<f64>,
111    /// k-NN indices (excluding self).
112    pub nn_indices: Vec<Vec<usize>>,
113    /// k-NN distances (excluding self).
114    pub nn_distances: Vec<Vec<T>>,
115}
116
117impl<T: EvocFloat> EvocResult<T> {
118    /// Returns labels from the layer with the highest persistence score.
119    ///
120    /// Falls back to the only available layer when there is just one.
121    pub fn best_labels(&self) -> &[i64] {
122        if self.cluster_layers.len() <= 1 {
123            &self.cluster_layers[0]
124        } else {
125            let best = self
126                .persistence_scores
127                .iter()
128                .enumerate()
129                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
130                .map(|(i, _)| i)
131                .unwrap_or(0);
132            &self.cluster_layers[best]
133        }
134    }
135
136    /// Returns membership strengths corresponding to
137    /// [`EvocResult::best_labels`].
138    pub fn best_strengths(&self) -> &[T] {
139        if self.membership_strengths.len() <= 1 {
140            &self.membership_strengths[0]
141        } else {
142            let best = self
143                .persistence_scores
144                .iter()
145                .enumerate()
146                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
147                .map(|(i, _)| i)
148                .unwrap_or(0);
149            &self.membership_strengths[best]
150        }
151    }
152
153    /// Number of clusters in the best layer, excluding noise points.
154    pub fn n_clusters(&self) -> usize {
155        let labels = self.best_labels();
156        (labels.iter().copied().reduce(i64::max).unwrap_or(-1) + 1).max(0) as usize
157    }
158}
159
160//////////
161// Main //
162//////////
163
164/// Run EVoC clustering on high-dimensional embedding data.
165///
166/// 1. **kNN graph** — builds an approximate nearest-neighbour graph via the
167///    selected ANN backend (`ann_type`), or uses `precomputed_knn` if
168///    provided.
169/// 2. **Fuzzy simplicial set** — smooths and optionally symmetrises the kNN
170///    graph into a weighted undirected graph.
171/// 3. **Node embedding** — optimises a low-dimensional layout using the EVoC
172///    gradient (a modified UMAP repulsion term controlled by `noise_level`).
173/// 4. **MST** — builds a minimum spanning tree over the embedding using
174///    mutual reachability distances.
175/// 5. **Cluster layers** — extracts a hierarchy of clusterings from the MST
176///    via persistence analysis, or searches for a fixed number of clusters if
177///    [`EvocParams::approx_n_clusters`] is set.
178///
179/// ### Params
180///
181/// * `data` — input matrix with shape `(n_points, n_features)`.
182/// * `ann_type` — ANN backend identifier (e.g. `"nndescent"`, `"hnsw"`).
183/// * `precomputed_knn` — pre-built `(indices, distances)` pair; pass `None`
184///   to build the graph from `data`.
185/// * `evoc_params` — clustering hyperparameters; see [`EvocParams`].
186/// * `nn_params` — hyperparameters forwarded to the ANN backend.
187/// * `seed` — random seed for reproducibility.
188/// * `verbose` — print progress and timing to stdout.
189///
190/// ### Returns
191///
192/// An [`EvocResult`] containing cluster layers, membership strengths,
193/// persistence scores, and the kNN graph.
194pub fn evoc<T>(
195    data: MatRef<T>,
196    ann_type: String,
197    precomputed_knn: PreComputedKnn<T>,
198    evoc_params: &EvocParams<T>,
199    nn_params: &NearestNeighbourParams<T>,
200    seed: usize,
201    verbose: bool,
202) -> EvocResult<T>
203where
204    T: EvocFloat + AnnSearchFloat,
205    NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
206    HnswIndex<T>: HnswState<T>,
207{
208    let start_all = Instant::now();
209
210    // 1. kNN graph
211    let (knn_indices, knn_dist) = match precomputed_knn {
212        Some((indices, distances)) => {
213            if verbose {
214                println!("Using precomputed kNN graph...");
215            }
216            (indices, distances)
217        }
218        None => {
219            if verbose {
220                println!(
221                    "Running approximate nearest neighbour search using {}...",
222                    ann_type
223                );
224            }
225            let start_knn = Instant::now();
226            let result = run_ann_search(
227                data,
228                evoc_params.n_neighbours,
229                ann_type,
230                nn_params,
231                seed,
232                verbose,
233            );
234            if verbose {
235                println!("kNN search done in {:.2?}.", start_knn.elapsed());
236            }
237            result
238        }
239    };
240
241    // 2. Fuzzy simplicial set
242    if verbose {
243        println!("Constructing fuzzy simplicial set...");
244    }
245    let start_graph = Instant::now();
246    let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
247    let graph =
248        build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
249    let adj = coo_to_adjacency_list(&graph);
250    if verbose {
251        println!(
252            "Fuzzy simplicial set done in {:.2?}.",
253            start_graph.elapsed()
254        );
255    }
256
257    // 3. Embedding dimensionality
258    // original code did 4 to 15 -> went up one for SIMD
259    let dim = evoc_params
260        .embedding_dim
261        .unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
262
263    // 4. Label propagation initialisation
264    let start_init = Instant::now();
265    let n = data.nrows();
266    let d = data.ncols();
267    let data_vecs: Vec<Vec<T>> = (0..n)
268        .map(|i| (0..d).map(|j| data[(i, j)]).collect())
269        .collect();
270
271    if verbose {
272        println!("Computing label propagation initialisation...");
273    }
274    let initial_embedding = crate::graph::label_prop::label_propagation_init(
275        &graph,
276        dim,
277        Some(&data_vecs),
278        seed as u64,
279        verbose,
280    );
281    if verbose {
282        println!("Label prop init done in {:.2?}.", start_init.elapsed());
283    }
284
285    // 5. Node embedding
286    if verbose {
287        println!(
288            "Computing {}-d node embedding ({} epochs)...",
289            dim, evoc_params.n_epochs
290        );
291    }
292    let start_embed = Instant::now();
293    let embed_params = EvocEmbeddingParams {
294        n_epochs: evoc_params.n_epochs,
295        noise_level: evoc_params.noise_level,
296        initial_alpha: T::from(0.1).unwrap(),
297        ..EvocEmbeddingParams::default()
298    };
299
300    let embedding = evoc_embedding(
301        &adj,
302        dim,
303        &embed_params,
304        Some(&initial_embedding),
305        seed as u64,
306        verbose,
307    );
308    if verbose {
309        println!("Embedding done in {:.2?}.", start_embed.elapsed());
310    }
311
312    // 6. Clustering
313    if verbose {
314        println!("Running density-based clustering...");
315    }
316    let start_cluster = Instant::now();
317
318    let (cluster_layers, membership_strengths, persistence_scores) =
319        if let Some(target_k) = evoc_params.approx_n_clusters {
320            let (labels, strengths) =
321                search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
322            (vec![labels], vec![strengths], vec![0.0])
323        } else {
324            build_cluster_layers(
325                &embedding,
326                evoc_params.min_samples,
327                evoc_params.base_min_cluster_size,
328                evoc_params.min_similarity_threshold,
329                evoc_params.max_layers,
330            )
331        };
332
333    if verbose {
334        let n_layers = cluster_layers.len();
335        println!(
336            "Clustering done in {:.2?}: {} layer(s).",
337            start_cluster.elapsed(),
338            n_layers,
339        );
340        println!("EVoC total: {:.2?}.", start_all.elapsed());
341    }
342
343    EvocResult {
344        cluster_layers,
345        membership_strengths,
346        persistence_scores,
347        nn_indices: knn_indices,
348        nn_distances: knn_dist,
349    }
350}
351
352/// Binary-searches over `min_cluster_size` to find approximately `target_k`
353/// clusters.
354///
355/// Builds the MST and linkage tree once from `embedding`, then re-condenses
356/// the tree at different `min_cluster_size` thresholds until the number of
357/// extracted leaves is close to `target_k`.
358///
359/// When the search converges, both boundary values (`lo` and `hi`) are
360/// evaluated and the one whose cluster count is closer to `target_k` is
361/// returned. Ties are broken in favour of whichever boundary assigns more
362/// points to non-noise clusters.
363///
364/// ### Params
365///
366/// * `embedding` — low-dimensional node positions, one `Vec<T>` per point.
367/// * `min_samples` — passed to [`build_mst`] for core-distance estimation.
368/// * `target_k` — desired number of clusters.
369///
370/// ### Returns
371///
372/// A `(labels, membership_strengths)` pair for the selected clustering.
373/// Labels follow the same convention as [`EvocResult::cluster_layers`]:
374/// `-1` is noise, non-negative integers are cluster IDs.
375pub fn search_for_n_clusters<T>(
376    embedding: &[Vec<T>],
377    min_samples: usize,
378    target_k: usize,
379) -> (Vec<i64>, Vec<T>)
380where
381    T: EvocFloat,
382{
383    let n = embedding.len();
384    if n == 0 {
385        return (Vec::new(), Vec::new());
386    }
387
388    let mut mst = build_mst(embedding, min_samples);
389    let linkage = mst_to_linkage_tree(&mut mst, n);
390
391    let mut lo = 2usize;
392    let mut hi = n / 2;
393
394    while hi - lo > 1 {
395        let mid = (lo + hi) / 2;
396        if mid == lo || mid == hi {
397            break;
398        }
399
400        let ct_mid = condense_tree(&linkage, n, mid);
401        let leaves_mid = extract_leaves(&ct_mid);
402        let mid_k = leaves_mid.len();
403
404        if mid_k < target_k {
405            // Need more clusters -> smaller min_cluster_size
406            hi = mid;
407        } else {
408            // Have enough or too many -> larger min_cluster_size
409            lo = mid;
410        }
411    }
412
413    // Pick whichever bound is closer to target
414    let ct_lo = condense_tree(&linkage, n, lo);
415    let leaves_lo = extract_leaves(&ct_lo);
416    let labels_lo = get_cluster_label_vector(&ct_lo, &leaves_lo, n);
417    let lo_k = leaves_lo.len();
418
419    let ct_hi = condense_tree(&linkage, n, hi);
420    let leaves_hi = extract_leaves(&ct_hi);
421    let labels_hi = get_cluster_label_vector(&ct_hi, &leaves_hi, n);
422    let hi_k = leaves_hi.len();
423
424    let lo_diff = (lo_k as isize - target_k as isize).unsigned_abs();
425    let hi_diff = (hi_k as isize - target_k as isize).unsigned_abs();
426
427    if lo_diff < hi_diff {
428        let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
429        (labels_lo, strengths)
430    } else if hi_diff < lo_diff {
431        let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
432        (labels_hi, strengths)
433    } else {
434        // Tie: prefer whichever has more non-noise points (matches Python)
435        let lo_assigned = labels_lo.iter().filter(|&&l| l >= 0).count();
436        let hi_assigned = labels_hi.iter().filter(|&&l| l >= 0).count();
437        if lo_assigned >= hi_assigned {
438            let strengths = get_point_membership_strengths(&ct_lo, &leaves_lo, &labels_lo);
439            (labels_lo, strengths)
440        } else {
441            let strengths = get_point_membership_strengths(&ct_hi, &leaves_hi, &labels_hi);
442            (labels_hi, strengths)
443        }
444    }
445}
446
447/////////////////
448// GPU version //
449/////////////////
450
451/// Run EVoC clustering with GPU-accelerated kNN search.
452///
453/// Identical to [`evoc`] except the kNN graph is constructed on the GPU via
454/// `manifolds-rs`'s GPU ANN backends. Fuzzy graph construction, node
455/// embedding, MST and persistence analysis all remain on the CPU.
456///
457/// ### Params
458///
459/// * `data` — input matrix with shape `(n_points, n_features)`.
460/// * `ann_type` — GPU ANN backend: `"exhaustive_gpu"`, `"ivf_gpu"` or
461///   `"nndescent_gpu"`.
462/// * `precomputed_knn` — pre-built `(indices, distances)` pair; pass `None`
463///   to build the graph on the GPU.
464/// * `evoc_params` — clustering hyperparameters; see [`EvocParams`].
465/// * `nn_params` — GPU nearest-neighbour search parameters.
466/// * `device` — GPU device.
467/// * `seed` — random seed for reproducibility.
468/// * `verbose` — print progress and timing to stdout.
469///
470/// ### Returns
471///
472/// An [`EvocResult`] containing cluster layers, membership strengths,
473/// persistence scores, and the kNN graph.
474#[allow(clippy::too_many_arguments)]
475#[cfg(feature = "gpu")]
476pub fn evoc_gpu<T, R>(
477    data: MatRef<T>,
478    ann_type: String,
479    precomputed_knn: PreComputedKnn<T>,
480    evoc_params: &EvocParams<T>,
481    nn_params: &NearestNeighbourParamsGpu<T>,
482    device: R::Device,
483    seed: usize,
484    verbose: bool,
485) -> EvocResult<T>
486where
487    T: EvocFloat + AnnSearchFloat + AnnSearchGpuFloat,
488    R: Runtime,
489    NNDescentGpu<T, R>: NNDescentQuery<T>,
490{
491    let start_all = Instant::now();
492
493    // 1. kNN graph (GPU)
494    let (knn_indices, knn_dist) = match precomputed_knn {
495        Some((indices, distances)) => {
496            if verbose {
497                println!("Using precomputed kNN graph...");
498            }
499            (indices, distances)
500        }
501        None => {
502            if verbose {
503                println!("Running GPU nearest neighbour search using {}...", ann_type);
504            }
505            let start_knn = Instant::now();
506            let result = run_ann_search_gpu::<T, R>(
507                data,
508                evoc_params.n_neighbours,
509                ann_type,
510                nn_params,
511                device,
512                seed,
513                verbose,
514            );
515            if verbose {
516                println!("GPU kNN search done in {:.2?}.", start_knn.elapsed());
517            }
518            result
519        }
520    };
521
522    // 2. Fuzzy simplicial set
523    if verbose {
524        println!("Constructing fuzzy simplicial set...");
525    }
526    let start_graph = Instant::now();
527    let effective_k = evoc_params.neighbour_scale * T::from(evoc_params.n_neighbours).unwrap();
528    let graph =
529        build_fuzzy_simplicial_set(&knn_indices, &knn_dist, effective_k, evoc_params.symmetrise);
530    let adj = coo_to_adjacency_list(&graph);
531    if verbose {
532        println!(
533            "Fuzzy simplicial set done in {:.2?}.",
534            start_graph.elapsed()
535        );
536    }
537
538    // 3. Embedding dimensionality
539    let dim = evoc_params
540        .embedding_dim
541        .unwrap_or_else(|| (evoc_params.n_neighbours / 4).clamp(4, 16));
542
543    // 4. Label propagation initialisation
544    let start_init = Instant::now();
545    let n = data.nrows();
546    let d = data.ncols();
547    let data_vecs: Vec<Vec<T>> = (0..n)
548        .map(|i| (0..d).map(|j| data[(i, j)]).collect())
549        .collect();
550
551    if verbose {
552        println!("Computing label propagation initialisation...");
553    }
554    let initial_embedding = crate::graph::label_prop::label_propagation_init(
555        &graph,
556        dim,
557        Some(&data_vecs),
558        seed as u64,
559        verbose,
560    );
561    if verbose {
562        println!("Label prop init done in {:.2?}.", start_init.elapsed());
563    }
564
565    // 5. Node embedding
566    if verbose {
567        println!(
568            "Computing {}-d node embedding ({} epochs)...",
569            dim, evoc_params.n_epochs
570        );
571    }
572    let start_embed = Instant::now();
573    let embed_params = EvocEmbeddingParams {
574        n_epochs: evoc_params.n_epochs,
575        noise_level: evoc_params.noise_level,
576        initial_alpha: T::from(0.1).unwrap(),
577        ..EvocEmbeddingParams::default()
578    };
579
580    let embedding = evoc_embedding(
581        &adj,
582        dim,
583        &embed_params,
584        Some(&initial_embedding),
585        seed as u64,
586        verbose,
587    );
588    if verbose {
589        println!("Embedding done in {:.2?}.", start_embed.elapsed());
590    }
591
592    // 6. Clustering
593    if verbose {
594        println!("Running density-based clustering...");
595    }
596    let start_cluster = Instant::now();
597
598    let (cluster_layers, membership_strengths, persistence_scores) =
599        if let Some(target_k) = evoc_params.approx_n_clusters {
600            let (labels, strengths) =
601                search_for_n_clusters(&embedding, evoc_params.min_samples, target_k);
602            (vec![labels], vec![strengths], vec![0.0])
603        } else {
604            build_cluster_layers(
605                &embedding,
606                evoc_params.min_samples,
607                evoc_params.base_min_cluster_size,
608                evoc_params.min_similarity_threshold,
609                evoc_params.max_layers,
610            )
611        };
612
613    if verbose {
614        let n_layers = cluster_layers.len();
615        println!(
616            "Clustering done in {:.2?}: {} layer(s).",
617            start_cluster.elapsed(),
618            n_layers,
619        );
620        println!("EVoC (GPU) total: {:.2?}.", start_all.elapsed());
621    }
622
623    EvocResult {
624        cluster_layers,
625        membership_strengths,
626        persistence_scores,
627        nn_indices: knn_indices,
628        nn_distances: knn_dist,
629    }
630}