#![allow(clippy::needless_range_loop)] #![warn(missing_docs)]
pub mod data;
pub mod errors;
pub mod prelude;
pub mod training;
pub mod utils;
#[cfg(feature = "parametric")]
pub mod parametric;
use ann_search_rs::cpu::hnsw::{HnswIndex, HnswState};
use ann_search_rs::cpu::nndescent::{ApplySortedUpdates, NNDescent, NNDescentQuery};
use ann_search_rs::utils::dist::parse_ann_dist;
use faer::MatRef;
use rand_distr::{Distribution, StandardNormal};
use std::{default::Default, time::Instant};
use thousands::*;
#[cfg(feature = "parametric")]
use burn::tensor::{backend::AutodiffBackend, Element};
#[cfg(feature = "parametric")]
use num_traits::ToPrimitive;
#[cfg(feature = "gpu")]
use ann_search_rs::gpu::nndescent_gpu::NNDescentGpu;
#[cfg(feature = "gpu")]
use ann_search_rs::gpu::traits_gpu::AnnSearchGpuFloat;
#[cfg(feature = "gpu")]
use cubecl::prelude::*;
use crate::data::graph::*;
use crate::data::init::*;
use crate::data::nearest_neighbours::*;
use crate::data::pacmap_pairs::*;
use crate::data::structures::*;
use crate::prelude::*;
use crate::training::mds_optimiser::*;
use crate::training::pacmap_optimiser::{
optimise_pacmap, optimise_pacmap_parallel, PacMapOptimiser,
};
use crate::training::tsne_optimiser::*;
use crate::training::umap_optimisers::*;
use crate::utils::diffusions::*;
use crate::utils::math::compute_largest_eigenpairs_lanczos;
use crate::utils::potentials::compute_potential_distances;
use crate::utils::sparse_ops::matrix_power;
#[cfg(feature = "parametric")]
use crate::parametric::model::*;
#[cfg(feature = "parametric")]
use crate::parametric::parametric_train::*;
#[cfg(feature = "fft_tsne")]
use crate::utils::fft::FftwFloat;
pub type PreComputedKnn<T> = Option<(Vec<Vec<usize>>, Vec<Vec<T>>)>;
pub type TsneGraph<T> = Result<(CoordinateList<T>, Vec<Vec<usize>>, Vec<Vec<T>>), ManifoldsError>;
#[derive(Debug, Clone)]
pub struct UmapParams<T> {
n_dim: usize,
k: usize,
optimiser: String,
ann_type: String,
initialisation: String,
init_range: Option<T>,
nn_params: NearestNeighbourParams<T>,
umap_graph_params: UmapGraphParams<T>,
optim_params: UmapOptimParams<T>,
randomised: bool,
}
impl<T> UmapParams<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
optimiser: Option<String>,
ann_type: Option<String>,
initialisation: Option<String>,
init_range: Option<T>,
nn_params: Option<NearestNeighbourParams<T>>,
optim_params: Option<UmapOptimParams<T>>,
umap_graph_params: Option<UmapGraphParams<T>>,
randomised: Option<bool>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let k = k.unwrap_or(15);
let optimiser = optimiser.unwrap_or("adam_parallel".to_string());
let ann_type = ann_type.unwrap_or("kmknn".to_string());
let initialisation = initialisation.unwrap_or("spectral".to_string());
let nn_params = nn_params.unwrap_or_default();
let optim_params = optim_params.unwrap_or_default();
let umap_graph_params = umap_graph_params.unwrap_or_default();
let randomised = randomised.unwrap_or(false);
Self {
n_dim,
k,
optimiser,
ann_type,
initialisation,
init_range,
nn_params,
optim_params,
umap_graph_params,
randomised,
}
}
pub fn default_2d(
n_dim: Option<usize>,
k: Option<usize>,
min_dist: Option<T>,
spread: Option<T>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let k = k.unwrap_or(15);
let min_dist = min_dist.unwrap_or(T::from_f64(0.5).unwrap());
let spread = spread.unwrap_or(T::from_f64(1.0).unwrap());
Self {
n_dim,
k,
optimiser: "adam_parallel".into(),
ann_type: "kmknn".into(),
initialisation: "spectral".into(),
init_range: None,
nn_params: NearestNeighbourParams::default(),
optim_params: UmapOptimParams::from_min_dist_spread(
min_dist, spread, None, None, None, None, None, None, None,
),
umap_graph_params: UmapGraphParams::default(),
randomised: false,
}
}
}
#[allow(clippy::too_many_arguments)]
pub fn construct_umap_graph<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
k: usize,
ann_type: String,
umap_params: &UmapGraphParams<T>,
nn_params: &NearestNeighbourParams<T>,
n_epochs: usize,
seed: usize,
verbose: bool,
) -> (CoordinateList<T>, Vec<Vec<usize>>, Vec<Vec<T>>)
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!(
"Running approximate nearest neighbour search using {}...",
ann_type
);
}
let start_knn = Instant::now();
let result = run_ann_search(data, k, ann_type, nn_params, seed, verbose);
if verbose {
println!("kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Constructing fuzzy simplicial set...");
}
let start_graph = Instant::now();
let (sigma, rho) = smooth_knn_dist(
&knn_dist,
knn_dist[0].len(),
umap_params.local_connectivity,
umap_params.bandwidth,
64,
);
let graph = knn_to_coo(&knn_indices, &knn_dist, &sigma, &rho);
let graph = symmetrise_graph(graph, umap_params.mix_weight);
let graph = filter_weak_edges(graph, n_epochs, verbose);
if verbose {
println!(
"Finalised graph generation in {:.2?}.",
start_graph.elapsed()
);
}
(graph, knn_indices, knn_dist)
}
pub fn umap<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
umap_params: &UmapParams<T>,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
StandardNormal: Distribution<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let init_type = parse_initilisation(
&umap_params.initialisation,
umap_params.randomised,
umap_params.init_range,
)
.unwrap_or(EmbdInit::RandomInit { range: None });
let optimiser = parse_umap_optimiser(&umap_params.optimiser).unwrap_or_default();
if verbose {
println!(
"Running umap with alpha: {:.2?} and beta: {:.2?}",
umap_params.optim_params.a, umap_params.optim_params.b
);
}
let (graph, _, _) = construct_umap_graph(
data,
precomputed_knn,
umap_params.k,
umap_params.ann_type.clone(),
&umap_params.umap_graph_params,
&umap_params.nn_params,
umap_params.optim_params.n_epochs,
seed,
verbose,
);
if verbose {
println!(
"Initialising embedding via {} layout...",
umap_params.initialisation
);
}
let start_layout = Instant::now();
let mut embd = initialise_embedding(&init_type, umap_params.n_dim, seed as u64, &graph, data)?;
let graph_adj = coo_to_adjacency_list(&graph);
if verbose {
println!(
"Optimising embedding via {} ({} epochs) on {} edges...",
match optimiser {
UmapOptimiser::Adam => "Adam",
UmapOptimiser::Sgd => "SGD",
UmapOptimiser::AdamParallel => "Adam (multi-threaded)",
},
umap_params.optim_params.n_epochs,
graph.col_indices.len().separate_with_underscores()
);
}
match optimiser {
UmapOptimiser::Adam => optimise_embedding_adam(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
),
UmapOptimiser::Sgd => {
optimise_embedding_sgd(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
);
}
UmapOptimiser::AdamParallel => {
optimise_embedding_adam_parallel(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
);
}
}
let end_layout = start_layout.elapsed();
if verbose {
println!(
"Initialised and optimised embedding in: {:.2?}.",
end_layout
);
println!("UMAP complete!");
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; umap_params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..umap_params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[derive(Debug, Clone)]
pub struct TsneParams<T> {
pub n_dim: usize,
pub perplexity: T,
pub ann_type: String,
pub initialisation: String,
pub init_range: Option<T>,
pub nn_params: NearestNeighbourParams<T>,
pub optim_params: TsneOptimParams<T>,
pub randomised_init: bool,
}
impl<T> TsneParams<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
perplexity: Option<T>,
init_range: Option<T>,
lr: Option<T>,
n_epochs: Option<usize>,
ann_type: Option<String>,
theta: Option<T>,
n_interp_points: Option<usize>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let perplexity = perplexity.unwrap_or_else(|| T::from_f64(30.0).unwrap());
let lr = lr.unwrap_or_else(|| T::from_f64(200.0).unwrap());
let n_epochs = n_epochs.unwrap_or(1000);
let ann_type = ann_type.unwrap_or_else(|| "kmknn".to_string());
let theta = theta.unwrap_or_else(|| T::from_f64(0.5).unwrap());
let n_interp_points = n_interp_points.unwrap_or(3);
Self {
n_dim,
perplexity,
ann_type,
initialisation: "pca".to_string(),
init_range,
nn_params: NearestNeighbourParams::default(),
optim_params: TsneOptimParams {
n_epochs,
lr,
early_exag_iter: 250,
early_exag_factor: T::from_f64(12.0).unwrap(),
theta,
n_interp_points,
},
randomised_init: true,
}
}
}
pub fn construct_tsne_graph<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
perplexity: T,
ann_type: String,
nn_params: &NearestNeighbourParams<T>,
seed: usize,
verbose: bool,
) -> TsneGraph<T>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
let k_float = perplexity * T::from_f64(3.0).unwrap();
let k = k_float.to_usize().unwrap().max(5).min(data.nrows() - 1);
if verbose {
println!("Running kNN search (k={}) using {}...", k, ann_type);
}
let start_knn = Instant::now();
let result = run_ann_search(data, k, ann_type, nn_params, seed, verbose);
if verbose {
println!("kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Computing Gaussian affinities and symmetrising...");
}
let start_graph = Instant::now();
let directed_graph = gaussian_knn_affinities(
&knn_indices,
&knn_dist,
perplexity,
T::from_f64(1e-5).unwrap(),
200,
nn_params.dist_metric == "euclidean",
)?;
let graph = symmetrise_affinities_tsne(directed_graph);
if verbose {
println!(
"Finalised graph generation in {:.2?}.",
start_graph.elapsed()
);
}
Ok((graph, knn_indices, knn_dist))
}
#[cfg(feature = "fft_tsne")]
pub fn tsne<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
params: &TsneParams<T>,
approx_type: &str,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat + FftwFloat,
HnswIndex<T>: HnswState<T>,
StandardNormal: Distribution<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
if params.n_dim != 2 {
return Err(ManifoldsError::IncorrectDim {
n_dim: params.n_dim,
});
}
let (graph, _, _) = construct_tsne_graph(
data,
precomputed_knn,
params.perplexity,
params.ann_type.clone(),
¶ms.nn_params,
seed,
verbose,
)?;
if verbose {
println!("Initialising embedding via {}...", ¶ms.initialisation);
}
let init_type = parse_initilisation(
¶ms.initialisation,
params.randomised_init,
params.init_range,
)
.unwrap_or(EmbdInit::PcaInit {
randomised: true,
range: Some(T::from_f64(1e-2).unwrap()),
});
let mut embd = initialise_embedding(&init_type, params.n_dim, seed as u64, &graph, data)?;
let tsne_approx = parse_tsne_optimiser(approx_type).unwrap_or_default();
let start_optim = Instant::now();
match tsne_approx {
TsneOpt::BarnesHut => {
if verbose {
println!(
"Optimising via Barnes-Hut t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_bh_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
#[cfg(feature = "fft_tsne")]
TsneOpt::Fft => {
if verbose {
println!(
"Optimising via FFT Interpolation-based t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_fft_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
}
if verbose {
println!("Optimisation complete in {:.2?}.", start_optim.elapsed());
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[cfg(not(feature = "fft_tsne"))]
pub fn tsne<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
params: &TsneParams<T>,
approx_type: &str,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
StandardNormal: Distribution<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
if params.n_dim != 2 {
return Err(ManifoldsError::IncorrectDim {
n_dim: params.n_dim,
});
}
let (graph, _, _) = construct_tsne_graph(
data,
precomputed_knn,
params.perplexity,
params.ann_type.clone(),
¶ms.nn_params,
seed,
verbose,
)?;
if verbose {
println!("Initialising embedding via {}...", ¶ms.initialisation);
}
let init_type = parse_initilisation(
¶ms.initialisation,
params.randomised_init,
params.init_range,
)
.unwrap_or(EmbdInit::PcaInit {
randomised: false,
range: Some(T::from_f64(1e-2).unwrap()),
});
let mut embd = initialise_embedding(&init_type, params.n_dim, seed as u64, &graph, data)?;
let tsne_approx = parse_tsne_optimiser(approx_type).unwrap_or_default();
let start_optim = Instant::now();
match tsne_approx {
TsneOpt::BarnesHut => {
if verbose {
println!(
"Optimising via Barnes-Hut t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_bh_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
#[cfg(not(feature = "fft_tsne"))]
TsneOpt::Fft => {
panic!("FFT-accelerated t-SNE not available. Recompile with 'fft_tsne' feature or use 'barnes_hut' approximation.");
}
}
if verbose {
println!("Optimisation complete in {:.2?}.", start_optim.elapsed());
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[derive(Debug, Clone)]
pub struct PhateParams<T> {
pub n_dim: usize,
pub k: usize,
pub ann_type: String,
pub ann_params: NearestNeighbourParams<T>,
pub diffusion_params: PhateDiffusionParams<T>,
pub mds_method: String,
pub mds_iter: Option<usize>,
pub randomised: bool,
}
impl<T> PhateParams<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
ann_type: Option<String>,
decay: Option<T>,
bandwidth_scale: Option<T>,
graph_symmetry: Option<String>,
t_max: Option<usize>,
gamma: Option<T>,
n_landmarks: Option<usize>,
landmark_method: Option<String>,
n_svd: Option<usize>,
t_custom: Option<usize>,
mds_method: Option<String>,
mds_iter: Option<usize>,
randomised: Option<bool>,
) -> Self {
let phate_diffusion_params = PhateDiffusionParams::new(
Some(decay.unwrap_or_else(|| T::from_f64(40.0).unwrap())),
bandwidth_scale.unwrap_or_else(|| T::from_f64(1.0).unwrap()),
T::from_f64(1e-4).unwrap(),
graph_symmetry.unwrap_or("average".to_string()),
n_landmarks,
landmark_method.unwrap_or("spectral".to_string()),
n_svd,
t_max,
t_custom,
gamma.unwrap_or_else(|| T::from_f64(1.0).unwrap()),
);
Self {
n_dim: n_dim.unwrap_or(2),
k: k.unwrap_or(5),
ann_type: ann_type.unwrap_or_else(|| "kmknn".to_string()),
ann_params: NearestNeighbourParams::default(),
diffusion_params: phate_diffusion_params,
mds_method: mds_method.unwrap_or_else(|| "sgd_dense".to_string()),
mds_iter,
randomised: randomised.unwrap_or(true),
}
}
}
#[allow(clippy::too_many_arguments)]
pub fn construct_phate_diffusion<T>(
data: MatRef<T>,
k: usize,
precomputed_knn: PreComputedKnn<T>,
ann_type: &str,
nn_params: &NearestNeighbourParams<T>,
phate_params: &PhateParams<T>,
seed: usize,
verbose: bool,
) -> Result<PhateDiffusion<T>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!(
"Running approximate nearest neighbour search using {}...",
ann_type
);
}
let start_knn = Instant::now();
let result = run_ann_search(data, k, ann_type.to_string(), nn_params, seed, verbose);
if verbose {
println!("kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Calculating alpha decay affinities");
}
let start_alpha_affinities = Instant::now();
let graph = phate_alpha_decay_affinities(
&knn_indices,
&knn_dist,
phate_params.k,
phate_params.diffusion_params.decay,
phate_params.diffusion_params.bandwidth_scale,
phate_params.diffusion_params.thresh,
&phate_params.diffusion_params.graph_symmetry,
nn_params.dist_metric == "euclidean",
);
if verbose {
println!(
"Alpha decay affinity calculations done in: {:.2?}.",
start_alpha_affinities.elapsed()
);
}
let affinity = coo_to_csr(&graph);
let diffusion_op = build_diffusion_operator(&affinity);
match phate_params.diffusion_params.n_landmarks {
None => Ok(PhateDiffusion::Full {
operator: diffusion_op,
}),
Some(n_landmarks) if n_landmarks >= data.nrows() => Ok(PhateDiffusion::Full {
operator: diffusion_op,
}),
Some(n_landmarks) => {
if verbose {
println!(" Building {} landmarks...", n_landmarks);
}
let start_landmarks = Instant::now();
let landmarks = PhateLandmarks::build(
data,
&affinity,
&diffusion_op,
n_landmarks,
&phate_params.diffusion_params.landmark_method,
&nn_params.dist_metric,
seed,
Some(100),
verbose,
)?;
if verbose {
println!(
" Landmarks generated in: {:.2?}.",
start_landmarks.elapsed()
);
}
Ok(PhateDiffusion::Landmark { landmarks })
}
}
}
pub fn phate<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
phate_params: PhateParams<T>,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
StandardNormal: Distribution<T>,
{
let start_phate = Instant::now();
let phate_diffusion = construct_phate_diffusion(
data,
phate_params.k,
precomputed_knn,
&phate_params.ann_type,
&phate_params.ann_params,
&phate_params,
seed,
verbose,
)?;
let start_t = Instant::now();
let t = match phate_params.diffusion_params.t {
PhateTime::Auto { t_max } => {
if verbose {
println!("Finding optimal t (t_max={})...", t_max);
}
match &phate_diffusion {
PhateDiffusion::Landmark { landmarks } => landmarks.find_optimal_t(t_max),
PhateDiffusion::Full { operator } => {
let entropy = landmark_von_neumann_entropy(operator, t_max)?;
Ok(find_knee_point(&entropy))
}
}
}
PhateTime::Fixed(t) => Ok(t),
}?;
if verbose {
println!("Identified t = {} in {:.2?}.", t, start_t.elapsed());
}
let mds_method = parse_mds_method(&phate_params.mds_method).unwrap_or_default();
let dist = parse_ann_dist(&phate_params.ann_params.dist_metric).unwrap_or_default();
let mds_params = MdsOptimParams::new(
data.nrows(),
phate_params.randomised,
phate_params.mds_iter,
None,
);
let start_embed = Instant::now();
let embedding = match phate_diffusion {
PhateDiffusion::Full { operator } => {
if verbose {
println!("Powering diffusion operator...");
}
let powered = matrix_power(&operator, t);
let potential = calculate_potential(&powered, 1, phate_params.diffusion_params.gamma);
if verbose {
println!(
"Potential shape: {} × {} - calculated in {:.2?}.",
potential.shape().0,
potential.shape().1,
start_embed.elapsed()
);
}
let res = match mds_method {
MdsMethod::ClassicMds => {
if verbose {
println!("Computing pairwise distances, running classic MDS...");
}
let distances = compute_potential_distances(&potential, &dist);
classic_mds(&distances, phate_params.n_dim, mds_params.randomised, seed)
}
MdsMethod::SgdDense => {
if verbose {
println!("Computing pairwise distances, running SGD-MDS...");
}
let distances = compute_potential_distances(&potential, &dist);
sgd_mds(
&distances,
phate_params.n_dim,
&mds_params,
None,
seed,
verbose,
)
}
}?;
res
}
PhateDiffusion::Landmark { landmarks } => {
if verbose {
println!(
"Powering landmark operator ({} landmarks)...",
landmarks.get_n_landmarks()
);
}
let landmark_powered = landmarks.power(t);
let landmark_potential =
calculate_potential(&landmark_powered, 1, phate_params.diffusion_params.gamma);
if verbose {
println!(
"Landmark potential shape: {} × {} - calculated in {:.2?}.",
landmark_potential.shape().0,
landmark_potential.shape().1,
start_embed.elapsed()
);
println!("Computing landmark pairwise distances...");
}
let landmark_distances = compute_potential_distances(&landmark_potential, &dist);
let landmark_mds_params = MdsOptimParams::new(
landmarks.get_n_landmarks(),
phate_params.randomised,
None,
None,
);
if verbose {
println!("Running MDS on landmarks...");
}
let landmark_embedding = match mds_method {
MdsMethod::ClassicMds => classic_mds(
&landmark_distances,
phate_params.n_dim,
landmark_mds_params.randomised,
seed,
),
_ => sgd_mds(
&landmark_distances,
phate_params.n_dim,
&landmark_mds_params,
None,
seed,
verbose,
),
}?;
if verbose {
println!("Interpolating to full N points via Nyström...");
}
landmarks.interpolate_embedding(&landmark_embedding)
}
};
if verbose {
println!("Ran MDS in {:.2?}.", start_embed.elapsed());
println!("Finished running PHATE in {:.2?}.", start_phate.elapsed());
}
let n_samples = embedding.len();
let mut transposed = vec![vec![T::zero(); n_samples]; phate_params.n_dim];
for i in 0..n_samples {
for d in 0..phate_params.n_dim {
transposed[d][i] = embedding[i][d];
}
}
Ok(transposed)
}
#[derive(Debug, Clone)]
pub struct PacmapParams<T> {
pub n_dim: usize,
pub k: usize,
pub ann_type: String,
pub optimiser_type: String,
pub n_mid_near: usize,
pub n_further: usize,
pub mn_candidate_start: usize,
pub mn_candidate_end: usize,
pub initialisation: String,
pub nn_params: NearestNeighbourParams<T>,
pub optim_params: PacmapOptimParams<T>,
}
impl<T> PacmapParams<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
ann_type: Option<String>,
optimiser_type: Option<String>,
n_mid_near: Option<usize>,
n_further: Option<usize>,
mn_candidate_start: Option<usize>,
mn_candidate_end: Option<usize>,
initialisation: Option<String>,
nn_params: Option<NearestNeighbourParams<T>>,
optim_params: Option<PacmapOptimParams<T>>,
) -> Self {
let mn_candidate_end = mn_candidate_end.unwrap_or(50);
let k = k.unwrap_or(10).max(mn_candidate_end);
Self {
n_dim: n_dim.unwrap_or(2),
k,
ann_type: ann_type.unwrap_or("kmknn".to_string()),
optimiser_type: optimiser_type.unwrap_or("adam_parallel".to_string()),
n_mid_near: n_mid_near.unwrap_or(2),
n_further: n_further.unwrap_or(2),
mn_candidate_start: mn_candidate_start.unwrap_or(4),
mn_candidate_end,
initialisation: initialisation.unwrap_or("pca".to_string()),
nn_params: nn_params.unwrap_or_default(),
optim_params: optim_params.unwrap_or_default(),
}
}
}
impl<T> Default for PacmapParams<T>
where
T: ManifoldsFloat,
{
fn default() -> Self {
Self::new(
None, None, None, None, None, None, None, None, None, None, None,
)
}
}
pub fn pacmap<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
params_pacmap: &PacmapParams<T>,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
StandardNormal: Distribution<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let n_samples = data.nrows();
let (knn_indices, _) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!(
"Running approximate nearest neighbour search using {} (k={})...",
params_pacmap.ann_type, params_pacmap.k
);
}
let start_knn = Instant::now();
let result = run_ann_search(
data,
params_pacmap.k,
params_pacmap.ann_type.clone(),
¶ms_pacmap.nn_params,
seed,
verbose,
);
if verbose {
println!("kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Constructing PaCMAP pairs...");
}
let start_pairs = Instant::now();
let pairs: PacmapPairs = construct_pacmap_pairs(
&knn_indices,
params_pacmap.n_mid_near,
params_pacmap.n_further,
params_pacmap.mn_candidate_start,
params_pacmap.mn_candidate_end,
seed as u64,
);
let end_pairs = start_pairs.elapsed();
if verbose {
println!(
"Pairs generated in {:.2?}: {} near, {} mid-near, {} further.",
end_pairs,
pairs.near.len().separate_with_underscores(),
pairs.mid_near.len().separate_with_underscores(),
pairs.further.len().separate_with_underscores()
);
}
let init_type = parse_initilisation(¶ms_pacmap.initialisation, true, None).unwrap_or(
EmbdInit::PcaInit {
randomised: true,
range: None,
},
);
if verbose {
println!(
"Initialising embedding via {} layout...",
params_pacmap.initialisation
);
}
let dummy_graph = knn_to_coo_unweighted(&knn_indices);
let mut embd = initialise_embedding(
&init_type,
params_pacmap.n_dim,
seed as u64,
&dummy_graph,
data,
)?;
let optimiser = parse_pacmap_optimiser(¶ms_pacmap.optimiser_type).unwrap_or_default();
let start_optim = Instant::now();
match optimiser {
PacMapOptimiser::AdamParallel => {
optimise_pacmap_parallel(&mut embd, &pairs, ¶ms_pacmap.optim_params, verbose);
}
PacMapOptimiser::Adam => {
optimise_pacmap(&mut embd, &pairs, ¶ms_pacmap.optim_params, verbose);
}
}
let end_optim = start_optim.elapsed();
if verbose {
println!("Optimisation done in {:.2?}.", end_optim);
println!("PaCMAP complete!");
}
let mut transposed = vec![vec![T::zero(); n_samples]; params_pacmap.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..params_pacmap.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[derive(Debug, Clone)]
pub struct DiffusionMapsParams<T> {
pub n_dim: usize,
pub k: usize,
pub ann_type: String,
pub ann_params: NearestNeighbourParams<T>,
pub bandwidth_scale: T,
pub thresh: T,
pub graph_symmetry: String,
pub alpha_norm: T,
pub t: PhateTime,
pub n_landmarks: Option<usize>,
pub landmark_method: String,
pub n_svd: Option<usize>,
}
impl<T> DiffusionMapsParams<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
ann_type: Option<String>,
bandwidth_scale: Option<T>,
thresh: Option<T>,
graph_symmetry: Option<String>,
alpha_norm: Option<T>,
t_max: Option<usize>,
t_custom: Option<usize>,
n_landmarks: Option<usize>,
landmark_method: Option<String>,
n_svd: Option<usize>,
) -> Self {
let t_max = t_max.unwrap_or(PHATE_MAX_T);
let t = parse_phate_time(t_custom, t_max);
Self {
n_dim: n_dim.unwrap_or(2),
k: k.unwrap_or(5),
ann_type: ann_type.unwrap_or_else(|| "kmknn".to_string()),
ann_params: NearestNeighbourParams::default(),
bandwidth_scale: bandwidth_scale.unwrap_or_else(|| T::from_f64(1.0).unwrap()),
thresh: thresh.unwrap_or_else(|| T::from_f64(1e-4).unwrap()),
graph_symmetry: graph_symmetry.unwrap_or_else(|| "add".to_string()),
alpha_norm: alpha_norm.unwrap_or_else(|| T::from_f64(1.0).unwrap()),
t,
n_landmarks,
landmark_method: landmark_method.unwrap_or_else(|| "spectral".to_string()),
n_svd,
}
}
}
pub enum DiffusionMapsOperator<T>
where
T: ManifoldsFloat,
{
Full {
p_sym: CompressedSparseData<T>,
sqrt_degrees: Vec<T>,
},
Landmark {
landmarks: DiffusionMapsLandmarks<T>,
},
}
#[allow(clippy::too_many_arguments)]
pub fn construct_diffusion_maps_operator<T>(
data: MatRef<T>,
k: usize,
precomputed_knn: PreComputedKnn<T>,
ann_type: &str,
nn_params: &NearestNeighbourParams<T>,
dm_params: &DiffusionMapsParams<T>,
seed: usize,
verbose: bool,
) -> Result<DiffusionMapsOperator<T>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let use_full = match dm_params.n_landmarks {
None => true,
Some(n) => n >= data.nrows(),
};
let needs_affinity =
use_full || matches!(dm_params.landmark_method.as_str(), "spectral" | "density");
let affinity = if needs_affinity {
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!("Running ANN search using {}...", ann_type);
}
let start_knn = Instant::now();
let res = run_ann_search(data, k, ann_type.to_string(), nn_params, seed, verbose);
if verbose {
println!("kNN search done in {:.2?}.", start_knn.elapsed());
}
res
}
};
if verbose {
println!("Building Gaussian kernel affinities");
}
let graph = phate_alpha_decay_affinities(
&knn_indices,
&knn_dist,
dm_params.k,
Some(T::from_f64(2.0).unwrap()),
dm_params.bandwidth_scale,
dm_params.thresh,
&dm_params.graph_symmetry,
nn_params.dist_metric == "euclidean",
);
Some(coo_to_csr(&graph))
} else {
if verbose {
println!("Skipping full affinity (random landmarks).");
}
None
};
if use_full {
let kernel = affinity.unwrap();
let kernel_norm = apply_anisotropic_normalisation(&kernel, dm_params.alpha_norm);
let (p_sym, sqrt_degrees) = build_symmetric_diffusion_operator(&kernel_norm);
return Ok(DiffusionMapsOperator::Full {
p_sym,
sqrt_degrees,
});
}
let n_landmarks = dm_params.n_landmarks.unwrap();
if verbose {
println!(" Building {} landmarks...", n_landmarks);
}
let start_l = Instant::now();
let landmarks = DiffusionMapsLandmarks::build(
data,
affinity.as_ref(),
n_landmarks,
&dm_params.landmark_method,
&nn_params.dist_metric,
dm_params.alpha_norm,
dm_params.k,
dm_params.bandwidth_scale,
dm_params.thresh,
&dm_params.graph_symmetry,
seed,
dm_params.n_svd,
verbose,
)?;
if verbose {
println!(" Landmarks built in {:.2?}.", start_l.elapsed());
}
Ok(DiffusionMapsOperator::Landmark { landmarks })
}
pub fn diffusion_maps<T>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
dm_params: DiffusionMapsParams<T>,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let start_dm = Instant::now();
let op = construct_diffusion_maps_operator(
data,
dm_params.k,
precomputed_knn,
&dm_params.ann_type,
&dm_params.ann_params,
&dm_params,
seed,
verbose,
)?;
let embedding: Vec<Vec<T>> = match op {
DiffusionMapsOperator::Full {
p_sym,
sqrt_degrees,
} => {
let t = match dm_params.t {
PhateTime::Auto { t_max } => {
if verbose {
println!("Finding optimal t (t_max={})...", t_max);
}
let entropy = landmark_von_neumann_entropy(&p_sym, t_max)?;
find_knee_point(&entropy)
}
PhateTime::Fixed(t) => t,
};
if verbose {
println!("Using t = {}.", t);
println!(
"Computing top {} eigenpairs of P_sym...",
dm_params.n_dim + 1
);
}
let start_eig = Instant::now();
let n_ask = (dm_params.n_dim + 5).min(sqrt_degrees.len() - 1);
let (evals, evecs) = compute_largest_eigenpairs_lanczos(&p_sym, n_ask, seed as u64)?;
if verbose {
println!("Eigendecomposition done in {:.2?}.", start_eig.elapsed());
}
let n = sqrt_degrees.len();
let mut embedding = vec![vec![T::zero(); dm_params.n_dim]; n];
for comp_idx in 1..=dm_params.n_dim {
let lambda = T::from_f32(evals[comp_idx]).unwrap();
let lambda_t = lambda.powi(t as i32);
let mut max_abs = 0.0f32;
let mut sign = 1.0f32;
for i in 0..n {
let v = evecs[i][comp_idx];
if v.abs() > max_abs {
max_abs = v.abs();
sign = if v >= 0.0 { 1.0 } else { -1.0 };
}
}
for i in 0..n {
let u = T::from_f32(evecs[i][comp_idx] * sign).unwrap();
embedding[i][comp_idx - 1] = lambda_t * u / sqrt_degrees[i];
}
}
embedding
}
DiffusionMapsOperator::Landmark { landmarks } => {
let t = match dm_params.t {
PhateTime::Auto { t_max } => {
if verbose {
println!("Finding optimal t on landmarks (t_max={})...", t_max);
}
landmarks.find_optimal_t(t_max)?
}
PhateTime::Fixed(t) => t,
};
if verbose {
println!(
"Using t = {}. Eigendecomposing {}x{} landmark operator...",
t,
landmarks.get_n_landmarks(),
landmarks.get_n_landmarks()
);
}
let start_eig = Instant::now();
let (evals, evecs) = landmarks.eigendecompose(dm_params.n_dim, seed as u64)?;
if verbose {
println!("Eigendecomposition done in {:.2?}.", start_eig.elapsed());
println!("Nystroem-extending to full data...");
}
let (landmark_embedding, lambdas) =
landmarks.compute_landmark_embedding(&evals, &evecs, dm_params.n_dim, t);
landmarks.nystrom_extend(&landmark_embedding, &lambdas)
}
};
if verbose {
println!("Diffusion maps finished in {:.2?}.", start_dm.elapsed());
}
let n = embedding.len();
let mut transposed = vec![vec![T::zero(); n]; dm_params.n_dim];
for i in 0..n {
for d in 0..dm_params.n_dim {
transposed[d][i] = embedding[i][d];
}
}
Ok(transposed)
}
#[cfg(feature = "parametric")]
#[derive(Debug, Clone)]
pub struct ParametricUmapParams<T> {
pub n_dim: usize,
pub k: usize,
pub ann_type: String,
pub hidden_layers: Vec<usize>,
pub nn_params: NearestNeighbourParams<T>,
pub umap_graph_params: UmapGraphParams<T>,
pub train_param: TrainParametricParams<T>,
}
#[cfg(feature = "parametric")]
impl<T> ParametricUmapParams<T>
where
T: ManifoldsFloat + Element,
{
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
ann_type: Option<String>,
hidden_layers: Option<Vec<usize>>,
nn_params: Option<NearestNeighbourParams<T>>,
umap_graph_params: Option<UmapGraphParams<T>>,
train_param: Option<TrainParametricParams<T>>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let k = k.unwrap_or(15);
let ann_type = ann_type.unwrap_or("hnsw".to_string());
let hidden_layers = hidden_layers.unwrap_or(vec![128, 128, 128]);
let nn_params = nn_params.unwrap_or_default();
let umap_graph_params = umap_graph_params.unwrap_or_default();
let train_param = train_param.unwrap_or_default();
Self {
n_dim,
k,
ann_type,
hidden_layers,
nn_params,
umap_graph_params,
train_param,
}
}
pub fn default_2d(
n_dim: Option<usize>,
k: Option<usize>,
min_dist: Option<T>,
spread: Option<T>,
corr_weight: Option<T>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let k = k.unwrap_or(15);
let min_dist = min_dist.unwrap_or(T::from_f64(0.1).unwrap());
let spread = spread.unwrap_or(T::from_f64(1.0).unwrap());
let corr_weight = corr_weight.unwrap_or(T::from_f64(0.0).unwrap());
Self {
n_dim,
k,
ann_type: "hnsw".to_string(),
hidden_layers: vec![128, 128, 128],
nn_params: NearestNeighbourParams::default(),
umap_graph_params: UmapGraphParams::default(),
train_param: TrainParametricParams::from_min_dist_spread(
min_dist,
spread,
corr_weight,
None,
None,
None,
None,
),
}
}
}
#[cfg(feature = "parametric")]
pub fn parametric_umap<T, B>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
umap_params: &ParametricUmapParams<T>,
device: &B::Device,
seed: usize,
verbose: bool,
) -> Vec<Vec<T>>
where
T: ManifoldsFloat + Element,
B: AutodiffBackend,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let nn_params = umap_params.nn_params.clone();
if verbose {
println!(
"Running parametric umap with alpha: {:.2?} and beta: {:.2?}",
ToPrimitive::to_f32(&umap_params.train_param.a).unwrap(),
ToPrimitive::to_f32(&umap_params.train_param.b).unwrap()
);
}
let (graph, _, _) = construct_umap_graph(
data,
precomputed_knn,
umap_params.k,
umap_params.ann_type.clone(),
&umap_params.umap_graph_params,
&nn_params,
umap_params.train_param.n_epochs,
seed,
verbose,
);
let model_params = UmapMlpConfig::from_params(
data.ncols(),
umap_params.hidden_layers.clone(),
umap_params.n_dim,
);
let (embd, _) = train_parametric_umap::<B, T>(
data,
graph,
&model_params,
&umap_params.train_param,
device,
seed,
verbose,
);
embd
}
#[cfg(feature = "parametric")]
pub fn train_parametric_umap_model<T, B>(
data: MatRef<T>,
umap_params: &ParametricUmapParams<T>,
device: &B::Device,
seed: usize,
verbose: bool,
) -> (Vec<Vec<T>>, TrainedUmapModel<B, T>)
where
T: ManifoldsFloat + Element,
B: AutodiffBackend,
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let nn_params = umap_params.nn_params.clone();
if verbose {
println!(
"Training parametric umap model with alpha: {:.2?} and beta: {:.2?}",
ToPrimitive::to_f32(&umap_params.train_param.a).unwrap(),
ToPrimitive::to_f32(&umap_params.train_param.b).unwrap()
);
}
let (graph, _, _) = construct_umap_graph(
data,
None,
umap_params.k,
umap_params.ann_type.clone(),
&umap_params.umap_graph_params,
&nn_params,
umap_params.train_param.n_epochs,
seed,
verbose,
);
let model_params = UmapMlpConfig::from_params(
data.ncols(),
umap_params.hidden_layers.clone(),
umap_params.n_dim,
);
let (embd, trained_model) = train_parametric_umap::<B, T>(
data,
graph,
&model_params,
&umap_params.train_param,
device,
seed,
verbose,
);
(embd, trained_model)
}
#[cfg(feature = "gpu")]
#[derive(Debug, Clone)]
pub struct UmapParamsGpu<T> {
pub n_dim: usize,
pub k: usize,
pub optimiser: String,
pub ann_type: String,
pub initialisation: String,
pub init_range: Option<T>,
pub nn_params: NearestNeighbourParamsGpu<T>,
pub umap_graph_params: UmapGraphParams<T>,
pub optim_params: UmapOptimParams<T>,
pub randomised: bool,
}
#[cfg(feature = "gpu")]
impl<T> UmapParamsGpu<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
k: Option<usize>,
optimiser: Option<String>,
ann_type: Option<String>,
initialisation: Option<String>,
init_range: Option<T>,
nn_params: Option<NearestNeighbourParamsGpu<T>>,
optim_params: Option<UmapOptimParams<T>>,
umap_graph_params: Option<UmapGraphParams<T>>,
randomised: Option<bool>,
) -> Self {
Self {
n_dim: n_dim.unwrap_or(2),
k: k.unwrap_or(15),
optimiser: optimiser.unwrap_or("adam_parallel".to_string()),
ann_type: ann_type.unwrap_or("ivf_gpu".to_string()),
initialisation: initialisation.unwrap_or("spectral".to_string()),
init_range,
nn_params: nn_params.unwrap_or_default(),
optim_params: optim_params.unwrap_or_default(),
umap_graph_params: umap_graph_params.unwrap_or_default(),
randomised: randomised.unwrap_or(false),
}
}
pub fn default_2d(
n_dim: Option<usize>,
k: Option<usize>,
min_dist: Option<T>,
spread: Option<T>,
) -> Self {
let min_dist = min_dist.unwrap_or(T::from_f64(0.5).unwrap());
let spread = spread.unwrap_or(T::from_f64(1.0).unwrap());
Self {
n_dim: n_dim.unwrap_or(2),
k: k.unwrap_or(15),
optimiser: "adam_parallel".into(),
ann_type: "ivf_gpu".into(),
initialisation: "spectral".into(),
init_range: None,
nn_params: NearestNeighbourParamsGpu::default(),
optim_params: UmapOptimParams::from_min_dist_spread(
min_dist, spread, None, None, None, None, None, None, None,
),
umap_graph_params: UmapGraphParams::default(),
randomised: false,
}
}
}
#[cfg(feature = "gpu")]
#[allow(clippy::too_many_arguments)]
pub fn construct_umap_graph_gpu<T, R>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
k: usize,
ann_type: String,
umap_params: &UmapGraphParams<T>,
nn_params: &NearestNeighbourParamsGpu<T>,
n_epochs: usize,
device: R::Device,
seed: usize,
verbose: bool,
) -> (CoordinateList<T>, Vec<Vec<usize>>, Vec<Vec<T>>)
where
T: ManifoldsFloat + AnnSearchGpuFloat,
R: Runtime,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
if verbose {
println!("Running GPU nearest neighbour search using {}...", ann_type);
}
let start_knn = Instant::now();
let result =
run_ann_search_gpu::<T, R>(data, k, ann_type, nn_params, device, seed, verbose);
if verbose {
println!("GPU kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Constructing fuzzy simplicial set...");
}
let start_graph = Instant::now();
let (sigma, rho) = smooth_knn_dist(
&knn_dist,
knn_dist[0].len(),
umap_params.local_connectivity,
umap_params.bandwidth,
64,
);
let graph = knn_to_coo(&knn_indices, &knn_dist, &sigma, &rho);
let graph = symmetrise_graph(graph, umap_params.mix_weight);
let graph = filter_weak_edges(graph, n_epochs, verbose);
if verbose {
println!(
"Finalised graph generation in {:.2?}.",
start_graph.elapsed()
);
}
(graph, knn_indices, knn_dist)
}
#[cfg(feature = "gpu")]
pub fn umap_gpu<T, R>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
umap_params: &UmapParamsGpu<T>,
device: R::Device,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat + AnnSearchGpuFloat,
R: Runtime,
StandardNormal: Distribution<T>,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
let init_type = parse_initilisation(
&umap_params.initialisation,
umap_params.randomised,
umap_params.init_range,
)
.unwrap_or(EmbdInit::RandomInit { range: None });
let optimiser = parse_umap_optimiser(&umap_params.optimiser).unwrap_or_default();
if verbose {
println!(
"Running umap (GPU kNN) with alpha: {:.2?} and beta: {:.2?}",
umap_params.optim_params.a, umap_params.optim_params.b
);
}
let (graph, _, _) = construct_umap_graph_gpu::<T, R>(
data,
precomputed_knn,
umap_params.k,
umap_params.ann_type.clone(),
&umap_params.umap_graph_params,
&umap_params.nn_params,
umap_params.optim_params.n_epochs,
device,
seed,
verbose,
);
if verbose {
println!(
"Initialising embedding via {} layout...",
umap_params.initialisation
);
}
let start_layout = Instant::now();
let mut embd = initialise_embedding(&init_type, umap_params.n_dim, seed as u64, &graph, data)?;
let graph_adj = coo_to_adjacency_list(&graph);
if verbose {
println!(
"Optimising embedding via {} ({} epochs) on {} edges...",
match optimiser {
UmapOptimiser::Adam => "Adam",
UmapOptimiser::Sgd => "SGD",
UmapOptimiser::AdamParallel => "Adam (multi-threaded)",
},
umap_params.optim_params.n_epochs,
graph.col_indices.len().separate_with_underscores()
);
}
match optimiser {
UmapOptimiser::Adam => optimise_embedding_adam(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
),
UmapOptimiser::Sgd => {
optimise_embedding_sgd(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
);
}
UmapOptimiser::AdamParallel => {
optimise_embedding_adam_parallel(
&mut embd,
&graph_adj,
&umap_params.optim_params,
seed as u64,
verbose,
);
}
}
if verbose {
println!(
"Initialised and optimised embedding in: {:.2?}.",
start_layout.elapsed()
);
println!("UMAP (GPU) complete!");
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; umap_params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..umap_params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[cfg(feature = "gpu")]
#[derive(Debug, Clone)]
pub struct TsneParamsGpu<T> {
pub n_dim: usize,
pub perplexity: T,
pub ann_type: String,
pub initialisation: String,
pub init_range: Option<T>,
pub nn_params: NearestNeighbourParamsGpu<T>,
pub optim_params: TsneOptimParams<T>,
pub randomised_init: bool,
}
#[cfg(feature = "gpu")]
impl<T> TsneParamsGpu<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn new(
n_dim: Option<usize>,
perplexity: Option<T>,
init_range: Option<T>,
lr: Option<T>,
n_epochs: Option<usize>,
ann_type: Option<String>,
theta: Option<T>,
n_interp_points: Option<usize>,
) -> Self {
let n_dim = n_dim.unwrap_or(2);
let perplexity = perplexity.unwrap_or_else(|| T::from_f64(30.0).unwrap());
let lr = lr.unwrap_or_else(|| T::from_f64(200.0).unwrap());
let n_epochs = n_epochs.unwrap_or(1000);
let ann_type = ann_type.unwrap_or_else(|| "ivf_gpu".to_string());
let theta = theta.unwrap_or_else(|| T::from_f64(0.5).unwrap());
let n_interp_points = n_interp_points.unwrap_or(3);
Self {
n_dim,
perplexity,
ann_type,
initialisation: "pca".to_string(),
init_range,
nn_params: NearestNeighbourParamsGpu::default(),
optim_params: TsneOptimParams {
n_epochs,
lr,
early_exag_iter: 250,
early_exag_factor: T::from_f64(12.0).unwrap(),
theta,
n_interp_points,
},
randomised_init: true,
}
}
}
#[cfg(feature = "gpu")]
#[allow(clippy::too_many_arguments)]
pub fn construct_tsne_graph_gpu<T, R>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
perplexity: T,
ann_type: String,
nn_params: &NearestNeighbourParamsGpu<T>,
device: R::Device,
seed: usize,
verbose: bool,
) -> TsneGraph<T>
where
T: ManifoldsFloat + AnnSearchGpuFloat,
R: Runtime,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
let (knn_indices, knn_dist) = match precomputed_knn {
Some((indices, distances)) => {
if verbose {
println!("Using precomputed kNN graph...");
}
(indices, distances)
}
None => {
let k_float = perplexity * T::from_f64(3.0).unwrap();
let k = k_float.to_usize().unwrap().max(5).min(data.nrows() - 1);
if verbose {
println!("Running GPU kNN search (k={}) using {}...", k, ann_type);
}
let start_knn = Instant::now();
let result =
run_ann_search_gpu::<T, R>(data, k, ann_type, nn_params, device, seed, verbose);
if verbose {
println!("GPU kNN search done in: {:.2?}.", start_knn.elapsed());
}
result
}
};
if verbose {
println!("Computing Gaussian affinities and symmetrising...");
}
let start_graph = Instant::now();
let directed_graph = gaussian_knn_affinities(
&knn_indices,
&knn_dist,
perplexity,
T::from_f64(1e-5).unwrap(),
200,
nn_params.dist_metric == "euclidean",
)?;
let graph = symmetrise_affinities_tsne(directed_graph);
if verbose {
println!(
"Finalised graph generation in {:.2?}.",
start_graph.elapsed()
);
}
Ok((graph, knn_indices, knn_dist))
}
#[cfg(all(feature = "gpu", feature = "fft_tsne"))]
pub fn tsne_gpu<T, R>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
params: &TsneParamsGpu<T>,
approx_type: &str,
device: R::Device,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat + AnnSearchGpuFloat + FftwFloat,
R: Runtime,
StandardNormal: Distribution<T>,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
if params.n_dim != 2 {
return Err(ManifoldsError::IncorrectDim {
n_dim: params.n_dim,
});
}
let (graph, _, _) = construct_tsne_graph_gpu::<T, R>(
data,
precomputed_knn,
params.perplexity,
params.ann_type.clone(),
¶ms.nn_params,
device,
seed,
verbose,
)?;
if verbose {
println!("Initialising embedding via {}...", ¶ms.initialisation);
}
let init_type = parse_initilisation(
¶ms.initialisation,
params.randomised_init,
params.init_range,
)
.unwrap_or(EmbdInit::PcaInit {
randomised: params.randomised_init,
range: Some(T::from_f64(1e-2).unwrap()),
});
let mut embd = initialise_embedding(&init_type, params.n_dim, seed as u64, &graph, data)?;
let tsne_approx = parse_tsne_optimiser(approx_type).unwrap_or_default();
let start_optim = Instant::now();
match tsne_approx {
TsneOpt::BarnesHut => {
if verbose {
println!(
"Optimising via Barnes-Hut t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_bh_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
TsneOpt::Fft => {
if verbose {
println!(
"Optimising via FFT Interpolation-based t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_fft_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
}
if verbose {
println!("Optimisation complete in {:.2?}.", start_optim.elapsed());
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}
#[cfg(all(feature = "gpu", not(feature = "fft_tsne")))]
pub fn tsne_gpu<T, R>(
data: MatRef<T>,
precomputed_knn: PreComputedKnn<T>,
params: &TsneParamsGpu<T>,
approx_type: &str,
device: R::Device,
seed: usize,
verbose: bool,
) -> Result<Vec<Vec<T>>, ManifoldsError>
where
T: ManifoldsFloat + AnnSearchGpuFloat,
R: Runtime,
StandardNormal: Distribution<T>,
NNDescentGpu<T, R>: NNDescentQuery<T>,
{
if params.n_dim != 2 {
return Err(ManifoldsError::IncorrectDim {
n_dim: params.n_dim,
});
}
let (graph, _, _) = construct_tsne_graph_gpu::<T, R>(
data,
precomputed_knn,
params.perplexity,
params.ann_type.clone(),
¶ms.nn_params,
device,
seed,
verbose,
)?;
if verbose {
println!("Initialising embedding via {}...", ¶ms.initialisation);
}
let init_type = parse_initilisation(
¶ms.initialisation,
params.randomised_init,
params.init_range,
)
.unwrap_or(EmbdInit::PcaInit {
randomised: params.randomised_init,
range: Some(T::from_f64(1e-2).unwrap()),
});
let mut embd = initialise_embedding(&init_type, params.n_dim, seed as u64, &graph, data)?;
let tsne_approx = parse_tsne_optimiser(approx_type).unwrap_or_default();
let start_optim = Instant::now();
match tsne_approx {
TsneOpt::BarnesHut => {
if verbose {
println!(
"Optimising via Barnes-Hut t-SNE ({} epochs)...",
params.optim_params.n_epochs
);
}
optimise_bh_tsne(&mut embd, ¶ms.optim_params, &graph, verbose);
}
TsneOpt::Fft => {
panic!("FFT-accelerated t-SNE not available. Recompile with 'fft_tsne' feature or use 'barnes_hut' approximation.");
}
}
if verbose {
println!("Optimisation complete in {:.2?}.", start_optim.elapsed());
}
let n_samples = embd.len();
let mut transposed = vec![vec![T::zero(); n_samples]; params.n_dim];
for sample_idx in 0..n_samples {
for dim_idx in 0..params.n_dim {
transposed[dim_idx][sample_idx] = embd[sample_idx][dim_idx];
}
}
Ok(transposed)
}