#![allow(clippy::multiple_crate_versions)]
mod adam;
mod distance;
mod gradient;
pub mod knn;
mod neighbors;
mod sampling;
mod weights;
#[cfg(test)]
mod tests;
use bon::Builder;
use ndarray::{s, Array1, Array2, Array3, ArrayView2, Axis, Zip};
use ndarray_rand::rand_distr::{Normal, NormalError};
use ndarray_rand::RandomExt;
use petal_decomposition::{DecompositionError, Pca, RandomizedPca, RandomizedPcaBuilder};
use rand::rngs::SmallRng;
use rand::SeedableRng;
use rand_pcg::Mcg128Xsl64;
use std::cmp::min;
use std::time::Instant;
use thiserror::Error;
use tracing::{debug, warn};
use crate::adam::update_embedding_adam;
use crate::gradient::pacmap_grad;
use crate::knn::KnnError;
use crate::neighbors::{generate_pair_no_neighbors, generate_pairs};
use crate::weights::find_weights;
#[derive(Builder, Clone, Debug)]
pub struct Configuration {
#[builder(default = 2)]
pub embedding_dimensions: usize,
#[builder(default)]
pub initialization: Initialization,
#[builder(default = 0.5)]
pub mid_near_ratio: f32,
#[builder(default = 2.0)]
pub far_pair_ratio: f32,
pub override_neighbors: Option<usize>,
pub seed: Option<u64>,
#[builder(default)]
pub pair_configuration: PairConfiguration,
#[builder(default = 1.0)]
pub learning_rate: f32,
#[builder(default = (100, 100, 250))]
pub num_iters: (usize, usize, usize),
pub snapshots: Option<Vec<usize>>,
#[builder(default = 8_000)]
pub approx_threshold: usize,
}
impl Default for Configuration {
fn default() -> Self {
Self {
embedding_dimensions: 2,
initialization: Initialization::default(),
mid_near_ratio: 0.5,
far_pair_ratio: 2.0,
override_neighbors: None,
seed: None,
pair_configuration: PairConfiguration::default(),
learning_rate: 1.0,
num_iters: (100, 100, 250),
snapshots: None,
approx_threshold: 8_000,
}
}
}
#[derive(Clone, Debug, Default)]
#[non_exhaustive]
pub enum Initialization {
#[default]
Pca,
Value(Array2<f32>),
Random(Option<u64>),
}
#[derive(Clone, Debug, Default)]
#[non_exhaustive]
pub enum PairConfiguration {
#[default]
Generate,
NeighborsProvided {
pair_neighbors: Array2<u32>,
},
AllProvided {
pair_neighbors: Array2<u32>,
pair_mn: Array2<u32>,
pair_fp: Array2<u32>,
},
}
pub fn fit_transform(
x: ArrayView2<f32>,
config: Configuration,
) -> Result<(Array2<f32>, Option<Array3<f32>>), PaCMapError> {
let (n, dim) = x.dim();
if n <= 1 {
return Err(PaCMapError::SampleSize);
}
let PreprocessingResult {
x,
pca_solution,
maybe_transform,
..
} = preprocess_x(
x,
matches!(config.initialization, Initialization::Pca),
dim,
config.embedding_dimensions,
config.seed,
)?;
let embedding_init = if pca_solution {
YInit::Preprocessed
} else {
match config.initialization {
Initialization::Pca => {
let transform = maybe_transform.ok_or(PaCMapError::MissingTransform)?;
YInit::DimensionalReduction(transform)
}
Initialization::Value(value) => YInit::Value(value),
Initialization::Random(maybe_seed) => YInit::Random(maybe_seed),
}
};
let pair_decision = decide_num_pairs(
n,
config.override_neighbors,
config.mid_near_ratio,
config.far_pair_ratio,
)?;
if n - 1 < pair_decision.n_neighbors {
warn!("Sample size is smaller than n_neighbors. n_neighbors will be reduced.");
}
let pairs = sample_pairs(
x.view(),
pair_decision.n_neighbors,
pair_decision.n_mn,
pair_decision.n_fp,
config.pair_configuration,
config.seed,
config.approx_threshold,
)?;
pacmap(
x.view(),
config.embedding_dimensions,
pairs.pair_neighbors.view(),
pairs.pair_mn.view(),
pairs.pair_fp.view(),
config.learning_rate,
config.num_iters,
embedding_init,
config.snapshots.as_deref(),
)
}
#[allow(dead_code)]
struct PreprocessingResult {
x: Array2<f32>,
pca_solution: bool,
maybe_transform: Option<Transform>,
x_min: f32,
x_max: f32,
x_mean: Array1<f32>,
}
#[non_exhaustive]
enum Transform {
Pca(Pca<f32>),
RandomizedPca(RandomizedPca<f32, Mcg128Xsl64>),
SeededPca(RandomizedPca<f32, SmallRng>),
}
impl Transform {
pub fn transform(&self, x: ArrayView2<f32>) -> Result<Array2<f32>, DecompositionError> {
match self {
Transform::Pca(pca) => pca.transform(&x),
Transform::RandomizedPca(pca) => pca.transform(&x),
Transform::SeededPca(pca) => pca.transform(&x),
}
}
}
fn preprocess_x(
x: ArrayView2<f32>,
apply_pca: bool,
high_dim: usize,
low_dim: usize,
maybe_seed: Option<u64>,
) -> Result<PreprocessingResult, PaCMapError> {
let mut pca_solution = false;
let mut x_out: Array2<f32>;
let x_mean: Array1<f32>;
let x_min: f32;
let x_max: f32;
let mut maybe_transform = None;
if high_dim > 100 && apply_pca {
let n_components = min(100, x.nrows());
x_mean = x.mean_axis(Axis(0)).ok_or(PaCMapError::EmptyArrayMean)?;
match maybe_seed {
None => {
let mut pca = RandomizedPca::new(n_components);
x_out = pca.fit_transform(&x)?;
maybe_transform = Some(Transform::RandomizedPca(pca));
}
Some(seed) => {
let mut pca =
RandomizedPcaBuilder::with_rng(SmallRng::seed_from_u64(seed), n_components)
.build();
x_out = pca.fit_transform(&x)?;
maybe_transform = Some(Transform::SeededPca(pca));
}
};
pca_solution = true;
x_min = 0.0;
x_max = 0.0;
debug!("Applied PCA, the dimensionality becomes {n_components}");
} else {
x_out = x.to_owned();
x_min = *x_out
.iter()
.min_by(|&a, &b| f32::total_cmp(a, b))
.ok_or(PaCMapError::EmptyArrayMinMax)?;
x_max = *x_out
.iter()
.max_by(|&a, &b| f32::total_cmp(a, b))
.ok_or(PaCMapError::EmptyArrayMinMax)?;
x_out.mapv_inplace(|val| val - x_min);
x_out.mapv_inplace(|val| val / x_max);
x_mean = x_out
.mean_axis(Axis(0))
.ok_or(PaCMapError::EmptyArrayMean)?;
x_out -= &x_mean;
if apply_pca {
let n_components = min(x_out.nrows(), low_dim);
let mut pca = Pca::new(n_components);
pca.fit(&x_out)?;
maybe_transform = Some(Transform::Pca(pca));
}
debug!("x is normalized");
};
Ok(PreprocessingResult {
x: x_out,
pca_solution,
x_min,
x_max,
x_mean,
maybe_transform,
})
}
struct PairDecision {
n_neighbors: usize,
n_mn: usize,
n_fp: usize,
}
#[allow(clippy::cast_precision_loss)]
fn decide_num_pairs(
n: usize,
n_neighbors: Option<usize>,
mn_ratio: f32,
fp_ratio: f32,
) -> Result<PairDecision, PaCMapError> {
let n_neighbors = n_neighbors.unwrap_or_else(|| {
if n <= 10000 {
10
} else {
(10.0 + 15.0 * ((n as f32).log10() - 4.0)).round() as usize
}
});
let n_mn = (n_neighbors as f32 * mn_ratio).round() as usize;
let n_fp = (n_neighbors as f32 * fp_ratio).round() as usize;
if n_neighbors < 1 {
return Err(PaCMapError::InvalidNeighborCount);
}
if n_fp < 1 {
return Err(PaCMapError::InvalidFarPointCount);
}
Ok(PairDecision {
n_neighbors,
n_mn,
n_fp,
})
}
struct Pairs {
pair_neighbors: Array2<u32>,
pair_mn: Array2<u32>,
pair_fp: Array2<u32>,
}
fn sample_pairs(
x: ArrayView2<f32>,
n_neighbors: usize,
n_mn: usize,
n_fp: usize,
pair_config: PairConfiguration,
random_state: Option<u64>,
approx_threshold: usize,
) -> Result<Pairs, PaCMapError> {
debug!("Finding pairs");
match pair_config {
PairConfiguration::Generate => Ok(generate_pairs(
x,
n_neighbors,
n_mn,
n_fp,
random_state,
approx_threshold,
)?),
PairConfiguration::NeighborsProvided { pair_neighbors } => {
let expected_shape = [x.nrows() * n_neighbors, 2];
if pair_neighbors.shape() != expected_shape {
return Err(PaCMapError::InvalidNearestNeighborShape {
expected: expected_shape,
actual: pair_neighbors.shape().to_vec(),
});
}
debug!("Using provided nearest neighbor pairs.");
let (pair_mn, pair_fp) = generate_pair_no_neighbors(
x,
n_neighbors,
n_mn,
n_fp,
pair_neighbors.view(),
random_state,
);
debug!("Additional pairs sampled successfully.");
Ok(Pairs {
pair_neighbors,
pair_mn,
pair_fp,
})
}
PairConfiguration::AllProvided {
pair_neighbors,
pair_mn,
pair_fp,
} => {
debug!("Using all provided pairs.");
Ok(Pairs {
pair_neighbors,
pair_mn,
pair_fp,
})
}
}
}
enum YInit {
Value(Array2<f32>),
Preprocessed,
DimensionalReduction(Transform),
Random(Option<u64>),
}
#[allow(clippy::too_many_arguments)]
fn pacmap<'a>(
x: ArrayView2<f32>,
n_dims: usize,
pair_neighbors: ArrayView2<'a, u32>,
pair_mn: ArrayView2<'a, u32>,
pair_fp: ArrayView2<'a, u32>,
lr: f32,
num_iters: (usize, usize, usize),
y_initialization: YInit,
inter_snapshots: Option<&[usize]>,
) -> Result<(Array2<f32>, Option<Array3<f32>>), PaCMapError> {
let start_time = Instant::now();
let n = x.nrows();
let mut inter_snapshots = Snapshots::from(n_dims, n, inter_snapshots);
let mut y: Array2<f32> = match y_initialization {
YInit::Value(mut y) => {
let mean = y.mean_axis(Axis(0)).ok_or(PaCMapError::EmptyArrayMean)?;
let std = y.std_axis(Axis(0), 0.0);
Zip::from(&mut y)
.and_broadcast(&mean)
.and_broadcast(&std)
.par_for_each(|y_elem, &mean_elem, &std| {
*y_elem = (*y_elem - mean_elem) * 0.0001 / std;
});
y
}
YInit::Preprocessed => x.slice(s![.., ..n_dims]).to_owned() * 0.01,
YInit::DimensionalReduction(transform) => transform.transform(x)? * 0.01,
YInit::Random(maybe_seed) => {
let normal = Normal::new(0.0, 1.0)?;
match maybe_seed {
None => Array2::random((n, n_dims), normal),
Some(seed) => {
Array2::random_using((n, n_dims), normal, &mut SmallRng::seed_from_u64(seed))
}
}
}
};
let w_mn_init = 1000.0;
let beta1 = 0.9;
let beta2 = 0.999;
let mut m = Array2::zeros(y.dim());
let mut v = Array2::zeros(y.dim());
if let Some(ref mut snapshots) = inter_snapshots {
snapshots.states.slice_mut(s![0_usize, .., ..]).assign(&y);
}
debug!(
"Pair shapes: neighbors {:?}, MN {:?}, FP {:?}",
pair_neighbors.dim(),
pair_mn.dim(),
pair_fp.dim()
);
let num_iters_total = num_iters.0 + num_iters.1 + num_iters.2;
for itr in 0..num_iters_total {
let weights = find_weights(w_mn_init, itr, num_iters.0, num_iters.1);
let grad = pacmap_grad(y.view(), pair_neighbors, pair_mn, pair_fp, &weights);
let c = grad[(n, 0)];
if itr == 0 {
debug!("Initial Loss: {}", c);
}
update_embedding_adam(
y.view_mut(),
grad.view(),
m.view_mut(),
v.view_mut(),
beta1,
beta2,
lr,
itr,
);
if (itr + 1) % 10 == 0 {
debug!("Iteration: {:4}, Loss: {}", itr + 1, c);
}
let Some(ref mut snapshots) = &mut inter_snapshots else {
continue;
};
if let Some(index) = snapshots.indices.iter().position(|&x| x == itr + 1) {
snapshots.states.slice_mut(s![index, .., ..]).assign(&y);
}
}
let elapsed = start_time.elapsed();
debug!("Elapsed time: {:.2?}", elapsed);
Ok((y, inter_snapshots.map(|s| s.states)))
}
struct Snapshots<'a> {
states: Array3<f32>,
indices: &'a [usize],
}
impl<'a> Snapshots<'a> {
fn from(n_dims: usize, n: usize, maybe_snapshots: Option<&'a [usize]>) -> Option<Self> {
let snapshots = maybe_snapshots?;
Some(Self {
states: Array3::zeros((snapshots.len(), n, n_dims)),
indices: snapshots,
})
}
}
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum PaCMapError {
#[error("Sample size must be larger than one")]
SampleSize,
#[error("Invalid shape for nearest neighbor pairs. Expected {expected:?}, got {actual:?}")]
InvalidNearestNeighborShape {
expected: [usize; 2],
actual: Vec<usize>,
},
#[error("Failed to calculate mean axis: the array is empty")]
EmptyArrayMean,
#[error(transparent)]
Normal(#[from] NormalError),
#[error("The number of nearest neighbors can't be less than 1")]
InvalidNeighborCount,
#[error("The number of far points can't be less than 1")]
InvalidFarPointCount,
#[error("Failed to compute min or max of X: the array is empty")]
EmptyArrayMinMax,
#[error("The range of X is zero (max - min = 0), cannot normalize")]
ZeroRange,
#[error(transparent)]
Pca(#[from] DecompositionError),
#[error(transparent)]
Neighbors(#[from] KnnError),
#[error("Unable to perform PCA; transform not initialized")]
MissingTransform,
}