use ann_search_rs::cpu::hnsw::{HnswIndex, HnswState};
use ann_search_rs::cpu::nndescent::{ApplySortedUpdates, NNDescent, NNDescentQuery};
use ann_search_rs::prelude::KMeansTrainingParams;
use ann_search_rs::utils::dist::{parse_ann_dist, Dist};
use ann_search_rs::utils::k_means_utils::{assign_all_parallel, train_centroids};
use faer::MatRef;
use faer_traits::ComplexField;
use num_traits::Float;
use rand::prelude::Distribution;
use rand::rngs::StdRng;
use rand::seq::index::sample;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use rand_distr::weighted::WeightedIndex;
use rayon::prelude::*;
use rustc_hash::{FxBuildHasher, FxHashSet};
use std::ops::{AddAssign, Range};
use std::time::Instant;
use crate::data::graph::phate_alpha_decay_affinities;
use crate::data::structures::*;
use crate::prelude::*;
use crate::utils::math::*;
use crate::utils::sparse_ops::*;
pub const PHATE_MAX_T: usize = 100;
#[derive(Debug, Clone)]
pub struct PhateDiffusionParams<T> {
pub decay: Option<T>,
pub bandwidth_scale: T,
pub thresh: T,
pub graph_symmetry: String,
pub n_landmarks: Option<usize>,
pub landmark_method: String,
pub n_svd: Option<usize>,
pub t: PhateTime,
pub gamma: T,
}
impl<T> PhateDiffusionParams<T> {
#[allow(clippy::too_many_arguments)]
pub fn new(
decay: Option<T>,
bandwidth_scale: T,
thresh: T,
graph_symmetry: String,
n_landmarks: Option<usize>,
landmark_method: String,
n_svd: Option<usize>,
t_max: Option<usize>,
t_custom: Option<usize>,
gamma: T,
) -> Self {
let t_max = t_max.unwrap_or(PHATE_MAX_T);
let t = parse_phate_time(t_custom, t_max);
Self {
decay,
bandwidth_scale,
thresh,
graph_symmetry,
n_landmarks,
landmark_method,
n_svd,
t,
gamma,
}
}
}
pub fn build_diffusion_operator<T>(matrix: &CompressedSparseData<T>) -> CompressedSparseData<T>
where
T: ManifoldsFloat,
{
if !matrix.cs_type.is_csr() {
panic!("Diffusion operator requires CSR format input");
}
let (rows, _) = matrix.shape();
let degrees: Vec<T> = (0..rows)
.into_par_iter()
.map(|i| {
let start = matrix.indptr[i];
let end = matrix.indptr[i + 1];
let mut sum = T::zero();
for idx in start..end {
sum += matrix.data[idx];
}
sum
})
.collect();
let norm_data: Vec<T> = (0..rows)
.into_par_iter()
.flat_map(|i| {
let start = matrix.indptr[i];
let end = matrix.indptr[i + 1];
let deg = degrees[i];
let mut row_vals = Vec::with_capacity(end - start);
if deg > T::zero() {
for idx in start..end {
row_vals.push(matrix.data[idx] / deg);
}
} else {
for idx in start..end {
row_vals.push(matrix.data[idx]);
}
}
row_vals
})
.collect();
CompressedSparseData::new_csr(&norm_data, &matrix.indices, &matrix.indptr, matrix.shape())
}
pub enum PhateDiffusion<T>
where
T: ComplexField + Float,
{
Full {
operator: CompressedSparseData<T>,
},
Landmark {
landmarks: PhateLandmarks<T>,
},
}
#[derive(Debug, Clone)]
pub enum PhateTime {
Auto {
t_max: usize,
},
Fixed(usize),
}
impl Default for PhateTime {
fn default() -> Self {
PhateTime::Auto { t_max: 100 }
}
}
pub fn parse_phate_time(t_custom: Option<usize>, t_max: usize) -> PhateTime {
match t_custom {
Some(t) => PhateTime::Fixed(t),
None => PhateTime::Auto { t_max },
}
}
#[derive(Debug, Clone)]
pub enum LandmarkMethod {
Random {
seed: u64,
},
Spectral {
n_svd: usize,
},
Density {
seed: u64,
},
}
impl Default for LandmarkMethod {
fn default() -> Self {
LandmarkMethod::Spectral { n_svd: 100 }
}
}
type FlattenData<T> = (Vec<T>, usize, usize);
pub fn parse_landmark_method(
s: &str,
seed: Option<usize>,
n_svd: Option<usize>,
) -> Option<LandmarkMethod> {
let seed = seed.unwrap_or(42);
let n_svd = n_svd.unwrap_or(10);
match s.to_lowercase().as_str() {
"random" => Some(LandmarkMethod::Random { seed: seed as u64 }),
"spectral" => Some(LandmarkMethod::Spectral { n_svd }),
"density" => Some(LandmarkMethod::Density { seed: seed as u64 }),
_ => None,
}
}
fn matrix_to_flat<T>(mat: MatRef<T>) -> FlattenData<T>
where
T: Float,
{
let n = mat.nrows();
let dim = mat.ncols();
let mut vectors_flat = Vec::with_capacity(n * dim);
for i in 0..n {
vectors_flat.extend(mat.row(i).iter().cloned());
}
(vectors_flat, n, dim)
}
fn random_sample(range: Range<usize>, n: usize, seed: u64) -> Vec<usize> {
let mut rng = StdRng::seed_from_u64(seed);
sample(&mut rng, range.len(), n)
.into_iter()
.map(|i| range.start + i)
.collect()
}
fn assign_to_landmark<T>(
data: &[T],
norm_data: &T,
landmark_data: &[T],
norm_landmark: &[T],
dist: &Dist,
n_landmark: usize,
dim: usize,
) -> usize
where
T: ManifoldsFloat,
{
let mut min_dist = T::infinity();
let mut min_idx = 0;
for idx in 0..n_landmark {
let landmark = &landmark_data[idx * dim..(idx + 1) * dim];
let dist = match dist {
Dist::SquaredEuclidean => T::euclidean_simd(landmark, data),
Dist::Cosine => {
let dot = T::dot_simd(landmark, data);
T::one() - (dot / (*norm_data * norm_landmark[idx]))
}
Dist::Manhattan => unreachable!(),
};
if dist < min_dist {
min_dist = dist;
min_idx = idx;
}
}
min_idx
}
fn build_landmarks_to_data<T>(
diffusion_op: &CompressedSparseData<T>,
assignments: &[usize],
n_landmarks: usize,
) -> CompressedSparseData<T>
where
T: ManifoldsFloat,
{
let n = assignments.len();
let mut landmark_pts: Vec<Vec<usize>> = vec![Vec::new(); n_landmarks];
for (point_idx, &landmark_idx) in assignments.iter().enumerate() {
landmark_pts[landmark_idx].push(point_idx);
}
let rows: Vec<SparseRow<T>> = (0..n_landmarks)
.into_par_iter()
.map(|landmark_idx| sparse_row_sum(diffusion_op, &landmark_pts[landmark_idx]))
.collect();
sparse_row_to_csr(&rows, n)
}
pub fn find_knee_point<T>(y: &[T]) -> usize
where
T: Float,
{
let n = y.len();
if n < 3 {
panic!("Cannot find knee point on vector of length < 3");
}
let x: Vec<T> = (0..n).map(|i| T::from(i).unwrap()).collect();
let mut sigma_x = Vec::with_capacity(n);
let mut sigma_y = Vec::with_capacity(n);
let mut sigma_xy = Vec::with_capacity(n);
let mut sigma_xx = Vec::with_capacity(n);
let mut sum_x = T::zero();
let mut sum_y = T::zero();
let mut sum_xy = T::zero();
let mut sum_xx = T::zero();
for i in 0..n {
sum_x = sum_x + x[i];
sum_y = sum_y + y[i];
sum_xy = sum_xy + x[i] * y[i];
sum_xx = sum_xx + x[i] * x[i];
sigma_x.push(sum_x);
sigma_y.push(sum_y);
sigma_xy.push(sum_xy);
sigma_xx.push(sum_xx);
}
let mut mfwd = Vec::with_capacity(n - 1);
let mut bfwd = Vec::with_capacity(n - 1);
for i in 1..n {
let n_points = T::from(i + 1).unwrap();
let det = n_points * sigma_xx[i] - sigma_x[i] * sigma_x[i];
if det.abs() > T::epsilon() {
let m = (n_points * sigma_xy[i] - sigma_x[i] * sigma_y[i]) / det;
let b = (sigma_xx[i] * sigma_y[i] - sigma_x[i] * sigma_xy[i]) / det;
mfwd.push(m);
bfwd.push(b);
} else {
mfwd.push(T::zero());
bfwd.push(y[0]);
}
}
let x_rev: Vec<T> = x.iter().rev().copied().collect();
let y_rev: Vec<T> = y.iter().rev().copied().collect();
let mut sigma_x_rev = Vec::with_capacity(n);
let mut sigma_y_rev = Vec::with_capacity(n);
let mut sigma_xy_rev = Vec::with_capacity(n);
let mut sigma_xx_rev = Vec::with_capacity(n);
sum_x = T::zero();
sum_y = T::zero();
sum_xy = T::zero();
sum_xx = T::zero();
for i in 0..n {
sum_x = sum_x + x_rev[i];
sum_y = sum_y + y_rev[i];
sum_xy = sum_xy + x_rev[i] * y_rev[i];
sum_xx = sum_xx + x_rev[i] * x_rev[i];
sigma_x_rev.push(sum_x);
sigma_y_rev.push(sum_y);
sigma_xy_rev.push(sum_xy);
sigma_xx_rev.push(sum_xx);
}
let mut mbck = Vec::with_capacity(n - 1);
let mut bbck = Vec::with_capacity(n - 1);
for i in 1..n {
let n_points = T::from(i + 1).unwrap();
let det = n_points * sigma_xx_rev[i] - sigma_x_rev[i] * sigma_x_rev[i];
if det.abs() > T::epsilon() {
let m = (n_points * sigma_xy_rev[i] - sigma_x_rev[i] * sigma_y_rev[i]) / det;
let b = (sigma_xx_rev[i] * sigma_y_rev[i] - sigma_x_rev[i] * sigma_xy_rev[i]) / det;
mbck.push(m);
bbck.push(b);
} else {
mbck.push(T::zero());
bbck.push(y_rev[0]);
}
}
mbck.reverse();
bbck.reverse();
let mut error_curve = vec![T::infinity(); n];
for breakpt in 1..n - 1 {
let mut error = T::zero();
for i in 0..=breakpt {
let predicted = mfwd[breakpt - 1] * x[i] + bfwd[breakpt - 1];
error = error + (predicted - y[i]).abs();
}
for i in breakpt..n {
let predicted = mbck[breakpt - 1] * x[i] + bbck[breakpt - 1];
error = error + (predicted - y[i]).abs();
}
error_curve[breakpt] = error;
}
let mut min_idx = 1;
let mut min_error = error_curve[1];
for (i, &err) in error_curve.iter().enumerate().skip(1).take(n - 2) {
if err < min_error {
min_error = err;
min_idx = i;
}
}
min_idx
}
#[allow(dead_code)]
pub struct PhateLandmarks<T>
where
T: Float + ComplexField,
{
n_landmarks: usize,
method: LandmarkMethod,
assignments: Vec<usize>,
landmark_op: CompressedSparseData<T>,
transitions: CompressedSparseData<T>,
}
impl<T> PhateLandmarks<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn build(
data: MatRef<T>,
affinity: &CompressedSparseData<T>,
diffusion_op: &CompressedSparseData<T>,
n_landmarks: usize,
method: &str,
distance: &str,
seed: usize,
n_svd: Option<usize>,
verbose: usize,
) -> Result<Self, ManifoldsError> {
let verbosity = parse_verbosity_level(verbose);
let (data, n, dim) = matrix_to_flat(data);
let landmark_method = parse_landmark_method(method, Some(seed), n_svd).unwrap_or_default();
let distance = parse_ann_dist(distance).unwrap_or_default();
let assignments: Vec<usize> = match landmark_method {
LandmarkMethod::Random { seed } => {
if verbosity.normal_verbosity() {
println!(" Using random selection of landmarks.")
}
let landmark_indices = random_sample(0..n, n_landmarks, seed);
let landmark_data: Vec<T> = landmark_indices
.iter()
.flat_map(|&i| data[i * dim..(i + 1) * dim].iter().copied())
.collect();
let norm_data = match distance {
Dist::Cosine => (0..n)
.into_par_iter()
.map(|i| T::calculate_l2_norm(&data[i * dim..(i + 1) * dim]))
.collect::<Vec<_>>(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => {
unreachable!("PHATE Landmark does not support Manhattan distance.")
}
};
let norm_landmark = match distance {
Dist::Cosine => (0..n_landmarks)
.into_par_iter()
.map(|i| T::calculate_l2_norm(&landmark_data[i * dim..(i + 1) * dim]))
.collect::<Vec<_>>(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => {
unreachable!("PHATE Landmark does not support Manhattan distance.")
}
};
(0..n)
.into_par_iter()
.map(|i| {
let norm = if matches!(distance, Dist::SquaredEuclidean) {
T::zero() } else {
norm_data[i]
};
assign_to_landmark(
&data[i * dim..(i + 1) * dim],
&norm,
&landmark_data,
&norm_landmark,
&distance,
n_landmarks,
dim,
)
})
.collect()
}
#[allow(unused_variables)]
LandmarkMethod::Spectral { n_svd } => {
if verbosity.normal_verbosity() {
println!(" Using spectral detection of landmarks.")
}
let svd = sparse_randomised_svd(affinity, n_svd, seed as u64, None, None)?;
if verbosity.detailed_verbosity() {
println!(" Finished calculation of randomised SVD on the affinity matrix.")
}
let v = &svd.v;
let k = v.ncols();
let mut embedding_flat: Vec<T> = vec![T::zero(); n * k];
for i in 0..n {
for idx in diffusion_op.indptr[i]..diffusion_op.indptr[i + 1] {
let j = diffusion_op.indices[idx];
let a_val = diffusion_op.data[idx];
for col in 0..k {
embedding_flat[i * k + col] += a_val * v[(j, col)];
}
}
}
let k_means_params = KMeansTrainingParams::new(30, None, None);
let centroids = train_centroids(
&embedding_flat,
k,
n,
n_landmarks,
&Dist::SquaredEuclidean,
Some(k_means_params),
seed,
verbosity.detailed_verbosity(),
)?;
if verbosity.detailed_verbosity() {
println!(" Centroids identified.")
}
let centroid_norms = vec![T::one(); n_landmarks]; let data_norms = vec![T::one(); n];
let assignemnts = assign_all_parallel(
&embedding_flat,
&data_norms,
k,
n,
¢roids,
¢roid_norms,
n_landmarks,
&Dist::SquaredEuclidean,
);
if verbosity.normal_verbosity() {
println!(" Landmark assignments done.")
}
assignemnts
}
LandmarkMethod::Density { seed } => {
if verbosity.normal_verbosity() {
println!(" Using degree-weighted (density) selection of landmarks.")
}
let weights: Vec<f64> = (0..n)
.into_par_iter()
.map(|i| {
let start = affinity.indptr[i];
let end = affinity.indptr[i + 1];
let mut sum = T::zero();
for idx in start..end {
sum += affinity.data[idx];
}
sum.to_f64().unwrap_or(0.0).sqrt()
})
.collect();
let mut rng = StdRng::seed_from_u64(seed);
let dist = WeightedIndex::new(&weights).expect("Failed to create weighted index. Check affinity matrix for negative/NaN values.");
let mut landmark_set =
FxHashSet::with_capacity_and_hasher(n_landmarks, FxBuildHasher);
while landmark_set.len() < n_landmarks {
landmark_set.insert(dist.sample(&mut rng));
}
let landmark_indices: Vec<usize> = landmark_set.into_iter().collect();
let landmark_data: Vec<T> = landmark_indices
.iter()
.flat_map(|&i| data[i * dim..(i + 1) * dim].iter().copied())
.collect();
let norm_data = match distance {
Dist::Cosine => (0..n)
.into_par_iter()
.map(|i| T::calculate_l2_norm(&data[i * dim..(i + 1) * dim]))
.collect::<Vec<_>>(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => {
unreachable!("PHATE Landmark does not support Manhattan distance.")
}
};
let norm_landmark = match distance {
Dist::Cosine => (0..n_landmarks)
.into_par_iter()
.map(|i| T::calculate_l2_norm(&landmark_data[i * dim..(i + 1) * dim]))
.collect::<Vec<_>>(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => {
unreachable!("PHATE Landmark does not support Manhattan distance.")
}
};
(0..n)
.into_par_iter()
.map(|i| {
let norm = if matches!(distance, Dist::SquaredEuclidean) {
T::zero() } else {
norm_data[i]
};
assign_to_landmark(
&data[i * dim..(i + 1) * dim],
&norm,
&landmark_data,
&norm_landmark,
&distance,
n_landmarks,
dim,
)
})
.collect()
}
};
let mut p_mn = build_landmarks_to_data(diffusion_op, &assignments, n_landmarks);
let mut p_nm = p_mn.transpose();
normalise_csr_rows_l1(&mut p_mn)?;
normalise_csr_rows_l1(&mut p_nm)?;
let landmark_op = csr_matmul_csr(&p_mn, &p_nm)?;
Ok(Self {
n_landmarks,
method: landmark_method,
assignments,
landmark_op,
transitions: p_nm,
})
}
pub fn power(&self, t: usize) -> Result<CompressedSparseData<T>, ManifoldsError>
where
T: AddAssign,
{
if t == 0 {
unimplemented!("Return identity for t = 0");
} else if t == 1 {
return Ok(self.landmark_op.clone());
}
matrix_power(&self.landmark_op, t)
}
pub fn power_optimal(&self, t_max: usize) -> Result<CompressedSparseData<T>, ManifoldsError>
where
T: AddAssign,
{
let t_opt = self.find_optimal_t(t_max)?;
let res = matrix_power(&self.landmark_op, t_opt)?;
Ok(res)
}
pub fn interpolate(
&self,
landmark_diffusion: &CompressedSparseData<T>,
) -> Result<CompressedSparseData<T>, ManifoldsError>
where
T: AddAssign,
{
csr_matmul_csr(&self.transitions, landmark_diffusion)
}
pub fn find_optimal_t(&self, t_max: usize) -> Result<usize, ManifoldsError> {
let entropy = landmark_von_neumann_entropy(&self.landmark_op, t_max)?;
Ok(find_knee_point(&entropy))
}
pub fn interpolate_embedding(&self, landmark_embedding: &[Vec<T>]) -> Vec<Vec<T>>
where
T: AddAssign,
{
let n = self.transitions.shape().0;
let n_dim = landmark_embedding[0].len();
let mut embedding = vec![vec![T::zero(); n_dim]; n];
for i in 0..n {
let start = self.transitions.indptr[i];
let end = self.transitions.indptr[i + 1];
for idx in start..end {
let l = self.transitions.indices[idx];
let w = self.transitions.data[idx];
for d in 0..n_dim {
embedding[i][d] += w * landmark_embedding[l][d];
}
}
}
embedding
}
pub fn get_n_landmarks(&self) -> usize {
self.n_landmarks
}
}
pub fn apply_anisotropic_normalisation<T>(
kernel: &CompressedSparseData<T>,
alpha: T,
) -> Result<CompressedSparseData<T>, ManifoldsError>
where
T: ManifoldsFloat,
{
if !kernel.cs_type.is_csr() {
return Err(ManifoldsError::SparseMatrixIsNotCsr);
}
if alpha.abs() < T::epsilon() {
return Ok(kernel.clone());
}
let (nrows, _) = kernel.shape();
let q: Vec<T> = (0..nrows)
.into_par_iter()
.map(|i| {
let start = kernel.indptr[i];
let end = kernel.indptr[i + 1];
kernel.data[start..end].iter().copied().sum()
})
.collect();
let q_alpha: Vec<T> = q.par_iter().map(|&qi| qi.powf(alpha)).collect();
let data: Vec<T> = (0..nrows)
.into_par_iter()
.flat_map(|i| {
let start = kernel.indptr[i];
let end = kernel.indptr[i + 1];
(start..end)
.map(|idx| {
let j = kernel.indices[idx];
let denom = q_alpha[i] * q_alpha[j];
if denom > T::zero() {
kernel.data[idx] / denom
} else {
T::zero()
}
})
.collect::<Vec<T>>()
})
.collect();
Ok(CompressedSparseData::new_csr(
&data,
&kernel.indices,
&kernel.indptr,
kernel.shape(),
))
}
pub fn build_symmetric_diffusion_operator<T>(
kernel: &CompressedSparseData<T>,
) -> Result<(CompressedSparseData<T>, Vec<T>), ManifoldsError>
where
T: ManifoldsFloat,
{
if !kernel.cs_type.is_csr() {
return Err(ManifoldsError::SparseMatrixIsNotCsr);
}
let (nrows, _) = kernel.shape();
let degrees: Vec<T> = (0..nrows)
.into_par_iter()
.map(|i| {
let start = kernel.indptr[i];
let end = kernel.indptr[i + 1];
kernel.data[start..end].iter().copied().sum()
})
.collect();
let sqrt_d: Vec<T> = degrees.par_iter().map(|&d| d.sqrt()).collect();
let data: Vec<T> = (0..nrows)
.into_par_iter()
.flat_map(|i| {
let start = kernel.indptr[i];
let end = kernel.indptr[i + 1];
let sqrt_di = sqrt_d[i];
(start..end)
.map(|idx| {
let j = kernel.indices[idx];
let denom = sqrt_di * sqrt_d[j];
if denom > T::zero() {
kernel.data[idx] / denom
} else {
T::zero()
}
})
.collect::<Vec<T>>()
})
.collect();
let p_sym =
CompressedSparseData::new_csr(&data, &kernel.indices, &kernel.indptr, kernel.shape());
Ok((p_sym, sqrt_d))
}
#[allow(clippy::too_many_arguments)]
fn build_data_to_landmark_transitions<T>(
data: &[T],
n: usize,
landmark_data: &[T],
n_landmarks: usize,
dim: usize,
k: usize,
bandwidth_scale: T,
thresh: T,
distance: &Dist,
) -> CompressedSparseData<T>
where
T: ManifoldsFloat,
{
let k_used = k.min(n_landmarks).max(1);
let machine_epsilon = T::epsilon();
let data_norms: Vec<T> = match distance {
Dist::Cosine => (0..n)
.into_par_iter()
.map(|i| T::calculate_l2_norm(&data[i * dim..(i + 1) * dim]))
.collect(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => unreachable!(),
};
let landmark_norms: Vec<T> = match distance {
Dist::Cosine => (0..n_landmarks)
.into_par_iter()
.map(|l| T::calculate_l2_norm(&landmark_data[l * dim..(l + 1) * dim]))
.collect(),
Dist::SquaredEuclidean => Vec::new(),
Dist::Manhattan => unreachable!(),
};
let bw_factor = match distance {
Dist::SquaredEuclidean => bandwidth_scale * bandwidth_scale,
Dist::Cosine => bandwidth_scale,
Dist::Manhattan => unreachable!(),
};
let rows: Vec<(Vec<usize>, Vec<T>)> = (0..n)
.into_par_iter()
.map(|i| {
let x = &data[i * dim..(i + 1) * dim];
let mut dists: Vec<(usize, T)> = (0..n_landmarks)
.map(|l| {
let y = &landmark_data[l * dim..(l + 1) * dim];
let d = match distance {
Dist::SquaredEuclidean => T::euclidean_simd(x, y),
Dist::Cosine => {
let dot = T::dot_simd(x, y);
let denom = data_norms[i] * landmark_norms[l];
if denom > T::zero() {
T::one() - (dot / denom)
} else {
T::zero()
}
}
Dist::Manhattan => unreachable!(),
};
(l, d)
})
.collect();
dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
dists.truncate(k_used);
let bandwidth_raw = dists.last().map(|(_, d)| *d).unwrap_or(T::zero());
let bandwidth = (bandwidth_raw * bw_factor).max(machine_epsilon);
let mut kept: Vec<(usize, T)> = dists
.into_iter()
.filter_map(|(l, d)| {
let w = match distance {
Dist::SquaredEuclidean => (-(d / bandwidth)).exp(),
Dist::Cosine => {
let scaled = d / bandwidth;
(-(scaled * scaled)).exp()
}
Dist::Manhattan => unreachable!(),
};
if w >= thresh {
Some((l, w))
} else {
None
}
})
.collect();
let sum: T = kept.iter().map(|(_, w)| *w).sum();
if sum > T::zero() {
for (_, w) in &mut kept {
*w /= sum;
}
}
kept.sort_by_key(|(l, _)| *l);
let indices: Vec<usize> = kept.iter().map(|(l, _)| *l).collect();
let values: Vec<T> = kept.iter().map(|(_, w)| *w).collect();
(indices, values)
})
.collect();
let total_nnz: usize = rows.iter().map(|(i, _)| i.len()).sum();
let mut out_data = Vec::with_capacity(total_nnz);
let mut out_indices = Vec::with_capacity(total_nnz);
let mut out_indptr = Vec::with_capacity(n + 1);
out_indptr.push(0);
for (idx, vals) in rows {
out_data.extend(vals);
out_indices.extend(idx);
out_indptr.push(out_data.len());
}
CompressedSparseData::new_csr(&out_data, &out_indices, &out_indptr, (n, n_landmarks))
}
#[allow(dead_code)]
pub struct DiffusionMapsLandmarks<T>
where
T: Float + ComplexField,
{
n_landmarks: usize,
method: LandmarkMethod,
landmark_indices: Vec<usize>,
p_sym_ll: CompressedSparseData<T>,
sqrt_degrees_l: Vec<T>,
p_nl: CompressedSparseData<T>,
}
impl<T> DiffusionMapsLandmarks<T>
where
T: ManifoldsFloat,
{
#[allow(clippy::too_many_arguments)]
pub fn build(
data: MatRef<T>,
affinity: Option<&CompressedSparseData<T>>,
n_landmarks: usize,
method: &str,
distance: &str,
alpha_norm: T,
k: usize,
bandwidth_scale: T,
thresh: T,
graph_symmetry: &str,
seed: usize,
n_svd: Option<usize>,
verbose: usize,
) -> Result<Self, ManifoldsError>
where
HnswIndex<T>: HnswState<T>,
NNDescent<T>: ApplySortedUpdates<T> + NNDescentQuery<T>,
{
let verbosity = parse_verbosity_level(verbose);
let start = Instant::now();
let (data_flat, n, dim) = matrix_to_flat(data);
let landmark_method = parse_landmark_method(method, Some(seed), n_svd).unwrap_or_default();
let dist_enum = parse_ann_dist(distance).unwrap_or_default();
let landmark_indices: Vec<usize> = match landmark_method {
LandmarkMethod::Random { seed } => {
if verbosity.normal_verbosity() {
println!(" Using random landmark selection.");
}
random_sample(0..n, n_landmarks, seed)
}
LandmarkMethod::Density { seed } => {
if verbosity.normal_verbosity() {
println!(" Using density landmark selection.");
}
let affinity = affinity.expect("density landmarks require affinity matrix");
let weights: Vec<f64> = (0..n)
.into_par_iter()
.map(|i| {
let start = affinity.indptr[i];
let end = affinity.indptr[i + 1];
let mut sum = T::zero();
for idx in start..end {
sum += affinity.data[idx];
}
sum.to_f64().unwrap_or(0.0).sqrt()
})
.collect();
let mut rng = StdRng::seed_from_u64(seed);
let weighted = WeightedIndex::new(&weights)
.expect("Failed to create weighted index for density landmarks");
let mut set = FxHashSet::with_capacity_and_hasher(n_landmarks, FxBuildHasher);
while set.len() < n_landmarks {
set.insert(weighted.sample(&mut rng));
}
set.into_iter().collect()
}
LandmarkMethod::Spectral { n_svd } => {
if verbosity.normal_verbosity() {
println!(" Using spectral landmark selection.");
}
let affinity = affinity.expect("spectral landmarks require affinity matrix");
let svd = sparse_randomised_svd(affinity, n_svd, seed as u64, None, None)?;
let v = &svd.v;
let k_proj = v.ncols();
let mut embedding_flat: Vec<T> = vec![T::zero(); n * k_proj];
for i in 0..n {
for idx in affinity.indptr[i]..affinity.indptr[i + 1] {
let j = affinity.indices[idx];
let a_val = affinity.data[idx];
for col in 0..k_proj {
embedding_flat[i * k_proj + col] += a_val * v[(j, col)];
}
}
}
let k_means_params = KMeansTrainingParams::new(30, None, None);
let centroids = train_centroids(
&embedding_flat,
k_proj,
n,
n_landmarks,
&Dist::SquaredEuclidean,
Some(k_means_params),
seed,
verbosity.detailed_verbosity(),
)?;
let centroid_norms = vec![T::one(); n_landmarks];
let data_norms = vec![T::one(); n];
let assignments = assign_all_parallel(
&embedding_flat,
&data_norms,
k_proj,
n,
¢roids,
¢roid_norms,
n_landmarks,
&Dist::SquaredEuclidean,
);
let mut indices: Vec<usize> = Vec::with_capacity(n_landmarks);
for cluster_idx in 0..n_landmarks {
let centroid = ¢roids[cluster_idx * k_proj..(cluster_idx + 1) * k_proj];
let mut best: Option<(usize, T)> = None;
for (point_idx, &c) in assignments.iter().enumerate() {
if c == cluster_idx {
let point =
&embedding_flat[point_idx * k_proj..(point_idx + 1) * k_proj];
let d = T::euclidean_simd(centroid, point);
if best.is_none_or(|(_, bd)| d < bd) {
best = Some((point_idx, d));
}
}
}
if let Some((idx, _)) = best {
if !indices.contains(&idx) {
indices.push(idx);
}
}
}
if indices.len() < n_landmarks {
let mut rng = StdRng::seed_from_u64(seed as u64 + 1);
let mut remaining: Vec<usize> =
(0..n).filter(|i| !indices.contains(i)).collect();
remaining.shuffle(&mut rng);
while indices.len() < n_landmarks {
if let Some(pick) = remaining.pop() {
indices.push(pick);
} else {
break;
}
}
}
indices
}
};
let landmark_data: Vec<T> = landmark_indices
.iter()
.flat_map(|&i| data_flat[i * dim..(i + 1) * dim].iter().copied())
.collect();
let landmark_mat =
faer::Mat::<T>::from_fn(n_landmarks, dim, |i, j| landmark_data[i * dim + j]);
let k_ll = k.min(n_landmarks.saturating_sub(1)).max(1);
let nn_params = NearestNeighbourParams {
dist_metric: distance.to_string(),
..Default::default()
};
let (ll_indices, ll_dists) = run_ann_search(
landmark_mat.as_ref(),
k_ll,
"kmknn".to_string(),
&nn_params,
seed,
verbose,
)?;
let ll_graph = phate_alpha_decay_affinities(
&ll_indices,
&ll_dists,
k_ll,
Some(T::from_f64(2.0).unwrap()),
bandwidth_scale,
thresh,
graph_symmetry,
distance == "euclidean",
);
let kernel_ll = coo_to_csr(&ll_graph);
let kernel_ll_norm = apply_anisotropic_normalisation(&kernel_ll, alpha_norm)?;
let (p_sym_ll, sqrt_degrees_l) = build_symmetric_diffusion_operator(&kernel_ll_norm)?;
let p_nl = build_data_to_landmark_transitions(
&data_flat,
n,
&landmark_data,
n_landmarks,
dim,
k,
bandwidth_scale,
thresh,
&dist_enum,
);
if verbosity.normal_verbosity() {
println!(" Finished landmark generation in {:.2?}", start.elapsed());
}
Ok(Self {
n_landmarks,
method: landmark_method,
landmark_indices,
p_sym_ll,
sqrt_degrees_l,
p_nl,
})
}
pub fn eigendecompose(
&self,
n_dim: usize,
seed: u64,
) -> Result<(Vec<f32>, Vec<Vec<f32>>), ManifoldsError> {
compute_largest_eigenpairs_lanczos(&self.p_sym_ll, n_dim + 1, seed)
}
pub fn find_optimal_t(&self, t_max: usize) -> Result<usize, ManifoldsError> {
let entropy = landmark_von_neumann_entropy(&self.p_sym_ll, t_max)?;
Ok(find_knee_point(&entropy))
}
pub fn compute_landmark_embedding(
&self,
evals: &[f32],
evecs: &[Vec<f32>],
n_dim: usize,
t: usize,
) -> (Vec<Vec<T>>, Vec<T>) {
let mut embedding: Vec<Vec<T>> = vec![vec![T::zero(); n_dim]; self.n_landmarks];
let mut lambdas: Vec<T> = Vec::with_capacity(n_dim);
for comp_idx in 1..=n_dim {
let lambda = T::from_f32(evals[comp_idx]).unwrap();
let lambda_t = lambda.powi(t as i32);
lambdas.push(lambda);
let mut max_abs = 0.0f32;
let mut sign = 1.0f32;
for l in 0..self.n_landmarks {
let v = evecs[l][comp_idx];
if v.abs() > max_abs {
max_abs = v.abs();
sign = if v >= 0.0 { 1.0 } else { -1.0 };
}
}
for l in 0..self.n_landmarks {
let u = T::from_f32(evecs[l][comp_idx] * sign).unwrap();
embedding[l][comp_idx - 1] = lambda_t * u / self.sqrt_degrees_l[l];
}
}
(embedding, lambdas)
}
pub fn nystrom_extend(&self, landmark_embedding: &[Vec<T>], lambdas: &[T]) -> Vec<Vec<T>> {
let n = self.p_nl.shape().0;
let n_dim = landmark_embedding[0].len();
let mut out = vec![vec![T::zero(); n_dim]; n];
for i in 0..n {
let start = self.p_nl.indptr[i];
let end = self.p_nl.indptr[i + 1];
for idx in start..end {
let l = self.p_nl.indices[idx];
let w = self.p_nl.data[idx];
for d in 0..n_dim {
out[i][d] += w * landmark_embedding[l][d];
}
}
}
for i in 0..n {
for d in 0..n_dim {
if lambdas[d].abs() > T::epsilon() {
out[i][d] /= lambdas[d];
}
}
}
out
}
pub fn get_n_landmarks(&self) -> usize {
self.n_landmarks
}
pub fn get_landmark_indices(&self) -> &[usize] {
&self.landmark_indices
}
}
#[cfg(test)]
mod test_data_diffusion {
use super::*;
#[test]
fn test_diffusion_operator_normalization() {
let data = vec![10.0, 30.0, 5.0];
let indices = vec![1, 2, 0];
let indptr = vec![0, 2, 3, 3];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (3, 3));
let diffusion_op = build_diffusion_operator(&kernel);
assert_eq!(diffusion_op.data[0], 0.25);
assert_eq!(diffusion_op.data[1], 0.75);
assert_eq!(diffusion_op.data[2], 1.0);
let row0_sum = diffusion_op.data[0] + diffusion_op.data[1];
assert!((row0_sum - 1.0).abs() < 1e-6);
}
#[test]
fn test_diffusion_operator_zero_degree_safety() {
let data = vec![2.0, 2.0];
let indices = vec![0, 2];
let indptr = vec![0, 0, 2];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (2, 3));
let diffusion_op = build_diffusion_operator(&kernel);
assert_eq!(diffusion_op.indptr[0], 0);
assert_eq!(diffusion_op.indptr[1], 0);
assert_eq!(diffusion_op.data[0], 0.5);
assert_eq!(diffusion_op.data[1], 0.5);
}
#[test]
fn test_anisotropic_alpha_zero_is_identity() {
let data = vec![0.5, 0.3, 0.2, 0.7];
let indices = vec![0, 1, 0, 1];
let indptr = vec![0, 2, 4];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (2, 2));
let result = apply_anisotropic_normalisation(&kernel, 0.0).unwrap();
assert_eq!(result.data, kernel.data);
assert_eq!(result.indices, kernel.indices);
assert_eq!(result.indptr, kernel.indptr);
}
#[test]
fn test_anisotropic_alpha_one_values() {
let data = vec![1.0, 2.0, 2.0, 1.0];
let indices = vec![0, 1, 0, 1];
let indptr = vec![0, 2, 4];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (2, 2));
let result = apply_anisotropic_normalisation(&kernel, 1.0).unwrap();
use approx::assert_relative_eq;
assert_relative_eq!(result.data[0], 1.0 / 9.0, epsilon = 1e-10);
assert_relative_eq!(result.data[1], 2.0 / 9.0, epsilon = 1e-10);
assert_relative_eq!(result.data[2], 2.0 / 9.0, epsilon = 1e-10);
assert_relative_eq!(result.data[3], 1.0 / 9.0, epsilon = 1e-10);
}
#[test]
fn test_anisotropic_preserves_symmetry() {
let data = vec![1.0, 0.5, 0.5, 1.0, 0.3, 0.3, 1.0];
let indices = vec![0, 1, 0, 1, 2, 1, 2];
let indptr = vec![0, 2, 5, 7];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (3, 3));
let result = apply_anisotropic_normalisation(&kernel, 0.5).unwrap();
let dense = result.to_dense();
use approx::assert_relative_eq;
for i in 0..3 {
for j in 0..3 {
assert_relative_eq!(dense[(i, j)], dense[(j, i)], epsilon = 1e-12);
}
}
}
#[test]
fn test_symmetric_operator_is_symmetric() {
let data = vec![1.0, 0.5, 0.5, 1.0, 0.3, 0.3, 1.0];
let indices = vec![0, 1, 0, 1, 2, 1, 2];
let indptr = vec![0, 2, 5, 7];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (3, 3));
let (p_sym, _) = build_symmetric_diffusion_operator(&kernel).unwrap();
let dense = p_sym.to_dense();
use approx::assert_relative_eq;
for i in 0..3 {
for j in 0..3 {
assert_relative_eq!(dense[(i, j)], dense[(j, i)], epsilon = 1e-12);
}
}
}
#[test]
fn test_symmetric_operator_sqrt_degrees() {
let data = vec![1.0, 0.5, 0.5, 1.0, 0.3, 0.3, 1.0];
let indices = vec![0, 1, 0, 1, 2, 1, 2];
let indptr = vec![0, 2, 5, 7];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (3, 3));
let (_, sqrt_d) = build_symmetric_diffusion_operator(&kernel).unwrap();
use approx::assert_relative_eq;
assert_relative_eq!(sqrt_d[0], 1.5_f64.sqrt(), epsilon = 1e-10);
assert_relative_eq!(sqrt_d[1], 1.8_f64.sqrt(), epsilon = 1e-10);
assert_relative_eq!(sqrt_d[2], 1.3_f64.sqrt(), epsilon = 1e-10);
}
#[test]
fn test_symmetric_operator_largest_eigenvalue_is_one() {
use crate::utils::math::compute_largest_eigenpairs_lanczos;
let data = vec![1.0, 0.5, 0.5, 1.0, 0.3, 0.3, 1.0];
let indices = vec![0, 1, 0, 1, 2, 1, 2];
let indptr = vec![0, 2, 5, 7];
let kernel = CompressedSparseData::new_csr(&data, &indices, &indptr, (3, 3));
let (p_sym, _) = build_symmetric_diffusion_operator(&kernel).unwrap();
let (evals, _) = compute_largest_eigenpairs_lanczos(&p_sym, 1, 42).unwrap();
use approx::assert_relative_eq;
assert_relative_eq!(evals[0], 1.0, epsilon = 1e-4);
}
}