use ndarray::{Array1, Array2, Axis};
use rand::Rng;
/// Strategy used to extract the principal components.
#[derive(Debug, Clone)]
pub enum PcaSolver {
    /// Pick a solver automatically from the data shape (many samples with
    /// a small fraction of the spectrum requested uses the randomized path).
    Auto,
    /// Randomized SVD with a range-finder sketch plus power iterations.
    Randomized {
        /// Extra random probe columns beyond `n_components`.
        n_oversamples: usize,
        /// Subspace (power) iterations used to sharpen the range estimate.
        n_power_iters: usize,
    },
    /// Deflated power iteration on the explicit covariance matrix.
    PowerIteration,
}
impl Default for PcaSolver {
fn default() -> Self {
PcaSolver::Auto
}
}
/// Configuration for [`pca`].
pub struct PcaConfig {
    /// Number of principal components to keep (clamped to `min(n, d)` at fit time).
    pub n_components: usize,
    /// Iteration cap for each power-iteration eigenvector solve.
    pub max_iterations: usize,
    /// Convergence threshold on the (sign-invariant) change of the iterated vector.
    pub tolerance: f64,
    /// Subtract the per-feature mean before fitting.
    pub center: bool,
    /// Divide each feature by its standard deviation before fitting.
    pub scale: bool,
    /// Which eigen/SVD strategy to use.
    pub solver: PcaSolver,
}
impl Default for PcaConfig {
fn default() -> Self {
Self {
n_components: 2,
max_iterations: 100,
tolerance: 1e-6,
center: true,
scale: false,
solver: PcaSolver::Auto,
}
}
}
/// Output of [`pca`]: the fitted components plus the statistics needed to
/// project new points with [`PcaResult::transform`].
pub struct PcaResult {
    /// Principal axes, one row per component, each of length `n_features`.
    pub components: Vec<Vec<f64>>,
    /// Variance captured by each component (covariance eigenvalues).
    pub explained_variance: Vec<f64>,
    /// Each component's share of the total variance.
    pub explained_variance_ratio: Vec<f64>,
    /// Per-feature mean subtracted during fitting (all zeros when `center` was false).
    pub mean: Vec<f64>,
    /// Per-feature scale divisor (all 1.0 when `scale` was false).
    pub std_dev: Vec<f64>,
    /// Number of training samples.
    pub n_samples: usize,
    /// Number of input features.
    pub n_features: usize,
    /// Iterations used by the last power-iteration solve, or the configured
    /// number of subspace iterations on the randomized path.
    pub iterations_used: usize,
}
impl PcaResult {
    /// Projects a batch of points onto the fitted principal components.
    ///
    /// Each input row is centered with `self.mean`, divided by `self.std_dev`
    /// when the model was fitted with scaling, and dotted with every
    /// component. Returns one `k`-dimensional row per input row, where
    /// `k == self.components.len()`. Returns an empty vector when `data` is
    /// empty or the model has no components.
    ///
    /// # Panics
    /// Panics if any row's length differs from `self.n_features`.
    pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let n = data.len();
        if n == 0 || self.components.is_empty() {
            return vec![];
        }
        let d = self.n_features;
        let k = self.components.len();
        let comp_flat: Vec<f64> = self.components.iter().flat_map(|r| r.iter().copied()).collect();
        let comp_mat = Array2::from_shape_vec((k, d), comp_flat)
            .expect("every component row must have n_features entries");
        let data_flat: Vec<f64> = data.iter().flat_map(|row| {
            // Validate up front so ragged input fails with a clear message
            // instead of a shape error from ndarray further down.
            assert_eq!(
                row.len(), d,
                "transform: every row must have {} features", d
            );
            row.iter().enumerate().map(|(j, &val)| {
                let mut v = val - self.mean[j];
                // std_dev is all 1.0 when fitted without scaling; skip the
                // no-op division in that case.
                if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                    v /= self.std_dev[j];
                }
                v
            })
        }).collect();
        let data_mat = Array2::from_shape_vec((n, d), data_flat).unwrap();
        let projected = data_mat.dot(&comp_mat.t());
        projected.rows().into_iter()
            .map(|row| row.to_vec())
            .collect()
    }

    /// Projects a single point; equivalent to `transform(&[point.to_vec()])[0]`
    /// without the matrix round-trip.
    ///
    /// # Panics
    /// Panics if `point.len() != self.n_features` (previously a short point
    /// failed with a bare index panic and extra trailing entries were
    /// silently ignored — both are now rejected explicitly).
    pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
        let d = self.n_features;
        assert_eq!(
            point.len(), d,
            "transform_one: point must have {} features", d
        );
        let k = self.components.len();
        let mut result = vec![0.0; k];
        for (c, component) in self.components.iter().enumerate() {
            let mut dot = 0.0;
            for j in 0..d {
                let mut val = point[j] - self.mean[j];
                if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                    val /= self.std_dev[j];
                }
                dot += val * component[j];
            }
            result[c] = dot;
        }
        result
    }
}
/// Fits a PCA model to `data` (rows = samples, columns = features).
///
/// Depending on `config.solver` this either runs a randomized SVD on the
/// (optionally centered/scaled) data matrix, or a deflated power iteration
/// on the explicit covariance matrix.
///
/// # Panics
/// Panics when `data` is empty, the first row is empty, or rows differ in
/// length.
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");
    // Cannot extract more components than samples or features.
    let k = config.n_components.min(d).min(n);
    let flat: Vec<f64> = data.iter().flat_map(|row| {
        assert_eq!(row.len(), d, "All rows must have the same number of features");
        row.iter().copied()
    }).collect();
    let mut mat = Array2::from_shape_vec((n, d), flat).unwrap();
    let mut mean = vec![0.0; d];
    if config.center {
        // Center in place; the subtracted mean is kept for transform().
        let mean_arr = mat.mean_axis(Axis(0)).unwrap();
        mat -= &mean_arr;
        mean = mean_arr.to_vec();
    }
    let mut std_dev = vec![1.0; d];
    if config.scale {
        // NOTE(review): when `center` is false this divides by the RMS of
        // the raw (uncentered) values rather than the true standard
        // deviation — confirm that is intended for scale-without-center.
        for j in 0..d {
            let col = mat.column(j);
            let ss: f64 = col.iter().map(|x| x * x).sum();
            // n.max(2) - 1 keeps the divisor at 1 for a single sample.
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            if s > 0.0 {
                mat.column_mut(j).mapv_inplace(|x| x / s);
            }
        }
    }
    // Auto: randomized SVD pays off for many samples when only a small
    // fraction of the spectrum is requested.
    let use_randomized = match &config.solver {
        PcaSolver::Auto => {
            let min_dim = n.min(d);
            n > 500 && (k as f64) < 0.8 * min_dim as f64
        }
        PcaSolver::Randomized { .. } => true,
        PcaSolver::PowerIteration => false,
    };
    // Defaults used when Auto picks the randomized path.
    let (n_oversamples, n_power_iters) = match &config.solver {
        PcaSolver::Randomized { n_oversamples, n_power_iters } => (*n_oversamples, *n_power_iters),
        _ => (10, 4),
    };
    if use_randomized {
        let (components, singular_values) = randomized_svd(&mat, k, n_oversamples, n_power_iters);
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        // Covariance eigenvalues are sigma^2 / (n - 1).
        let eigenvalues: Vec<f64> = singular_values.iter().map(|&s| s * s / denom).collect();
        let total_variance = compute_total_variance(&mat, denom);
        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();
        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            // On this path the field reports the configured number of
            // subspace iterations, not a convergence count.
            iterations_used: n_power_iters,
        }
    } else {
        let xt = mat.t();
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        // Explicit covariance matrix; deflated in place as components are found.
        let mut cov = xt.dot(&mat) / denom;
        let total_variance: f64 = (0..d).map(|i| cov[[i, i]]).sum();
        let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
        let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
        let mut last_iters = 0;
        for _ in 0..k {
            let (eigvec, _eigval, iters) = power_iteration(&cov, config.max_iterations, config.tolerance);
            last_iters = iters;
            let mut new_vec = eigvec;
            // Re-orthogonalize against previously extracted components to
            // counter numerical drift introduced by the deflation.
            for prev in &components {
                let dot: f64 = new_vec.iter().zip(prev).map(|(a, b)| a * b).sum();
                for (v, p) in new_vec.iter_mut().zip(prev) {
                    *v -= dot * p;
                }
            }
            let norm: f64 = new_vec.iter().map(|x| x * x).sum::<f64>().sqrt();
            if norm > 1e-15 {
                for v in &mut new_vec {
                    *v /= norm;
                }
            }
            // Rayleigh quotient of the orthogonalized vector gives its eigenvalue.
            let v_arr = Array1::from(new_vec.clone());
            let cv = cov.dot(&v_arr);
            let eigval = v_arr.dot(&cv);
            // Hotelling deflation: subtract eigval * v vᵀ from the covariance.
            for r in 0..d {
                for c in 0..d {
                    cov[[r, c]] -= eigval * new_vec[r] * new_vec[c];
                }
            }
            components.push(new_vec);
            eigenvalues.push(eigval);
        }
        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();
        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: last_iters,
        }
    }
}
/// Sum of per-column variances of an already-centered matrix, i.e. the
/// total variance that the explained-variance ratios are measured against.
fn compute_total_variance(mat: &Array2<f64>, denom: f64) -> f64 {
    (0..mat.ncols())
        .map(|col| {
            let ss: f64 = mat.column(col).iter().map(|x| x * x).sum();
            ss / denom
        })
        .sum()
}
/// Rank-`k` randomized SVD of `mat` (range-finder sketch + power iterations).
///
/// Returns the top right-singular vectors (rows of length `d`) and the
/// corresponding singular values. May return fewer than `k` pairs when the
/// numerical rank is exhausted.
fn randomized_svd(
    mat: &Array2<f64>, k: usize, n_oversamples: usize, n_power_iters: usize, ) -> (Vec<Vec<f64>>, Vec<f64>) {
    let n = mat.nrows();
    let d = mat.ncols();
    // Sketch width: requested rank plus oversampling, capped by the matrix size.
    let l = (k + n_oversamples).min(n).min(d);
    let mut rng = rand::thread_rng();
    // NOTE(review): `Standard` draws uniform values in [0, 1), not the
    // zero-mean Gaussian probes usual for randomized SVD; the power
    // iterations below compensate, but confirm this is intentional.
    let omega_flat: Vec<f64> = (0..d * l).map(|_| rng.sample::<f64, _>(rand::distributions::Standard)).collect();
    let omega = Array2::from_shape_vec((d, l), omega_flat).unwrap();
    // Range sketch Y = X * Omega, refined by (X Xᵀ)-style power iterations
    // with re-orthonormalization each round for numerical stability.
    let mut y = mat.dot(&omega);
    for _ in 0..n_power_iters {
        qr_modified_gram_schmidt(&mut y);
        let xty = mat.t().dot(&y);
        y = mat.dot(&xty);
    }
    qr_modified_gram_schmidt(&mut y);
    let q = y;
    // Project into the sketched subspace: B = Qᵀ X (l x d).
    let b = q.t().dot(mat);
    // Eigendecompose the small Gram matrix B Bᵀ to recover singular pairs.
    let bbt = b.dot(&b.t());
    let l_actual = bbt.nrows();
    let (eigvecs_left, eigvals) = symmetric_eigen(&bbt, l_actual);
    let mut components = Vec::with_capacity(k);
    let mut singular_values = Vec::with_capacity(k);
    for i in 0..k.min(l_actual) {
        let sigma_sq = eigvals[i];
        // Stop once the spectrum is numerically zero.
        if sigma_sq < 1e-30 {
            break;
        }
        let sigma = sigma_sq.sqrt();
        singular_values.push(sigma);
        // Right singular vector: v = Bᵀ u / sigma.
        let u_i = eigvecs_left.column(i);
        let v_i = b.t().dot(&u_i) / sigma;
        components.push(v_i.to_vec());
    }
    (components, singular_values)
}
/// Top-`k` eigenpairs of a symmetric matrix via repeated power iteration
/// with Hotelling deflation.
///
/// Returns the eigenvectors as columns of a `d x k` matrix and the
/// eigenvalues in extraction order (largest first).
fn symmetric_eigen(mat: &Array2<f64>, k: usize) -> (Array2<f64>, Vec<f64>) {
    let d = mat.nrows();
    let mut deflated = mat.clone();
    let mut eigvecs = Array2::<f64>::zeros((d, k));
    let mut eigvals = Vec::with_capacity(k);
    for i in 0..k {
        let (vec, _val, _) = power_iteration(&deflated, 200, 1e-12);
        let mut v = Array1::from(vec);
        // Re-orthogonalize against earlier eigenvectors to limit drift.
        for j in 0..i {
            let prev = eigvecs.column(j);
            let dot = prev.dot(&v);
            v -= &(&prev * dot);
        }
        let norm = v.dot(&v).sqrt();
        if norm > 1e-15 {
            v /= norm;
        }
        // Rayleigh quotient against the deflated matrix.
        let av = deflated.dot(&v);
        let val = v.dot(&av);
        // Deflate: subtract val * v vᵀ so the next pass finds the next pair.
        for r in 0..d {
            for c in 0..d {
                deflated[[r, c]] -= val * v[r] * v[c];
            }
        }
        eigvecs.column_mut(i).assign(&v);
        eigvals.push(val);
    }
    (eigvecs, eigvals)
}
/// In-place thin QR via modified Gram-Schmidt: on return the columns of
/// `mat` are orthonormal. A column whose residual norm underflows is left
/// at its (near-zero) residual rather than divided by ~0.
fn qr_modified_gram_schmidt(mat: &mut Array2<f64>) {
    for j in 0..mat.ncols() {
        let mut v = mat.column(j).to_owned();
        // Subtract the projection onto each previously orthonormalized
        // column, reusing the partially reduced vector each time (MGS).
        for i in 0..j {
            let q = mat.column(i).to_owned();
            let coeff = q.dot(&v);
            v -= &(&q * coeff);
        }
        let len = v.dot(&v).sqrt();
        if len > 1e-15 {
            v /= len;
        }
        mat.column_mut(j).assign(&v);
    }
}
/// Dominant eigenvector/eigenvalue of a symmetric `matrix` by power
/// iteration. Returns `(eigenvector, eigenvalue, iterations_run)`.
/// Convergence is measured up to sign, so sign-flipping iterates still
/// converge.
fn power_iteration(
    matrix: &Array2<f64>,
    max_iters: usize,
    tolerance: f64,
) -> (Vec<f64>, f64, usize) {
    let d = matrix.nrows();
    // Deterministic, non-degenerate start vector: (sqrt(1), sqrt(2), ...),
    // then normalized to unit length.
    let mut v = Array1::from_shape_fn(d, |i| ((i + 1) as f64).sqrt());
    let start_norm = v.dot(&v).sqrt();
    if start_norm > 0.0 {
        v /= start_norm;
    }
    let mut steps = 0;
    for step in 0..max_iters {
        steps = step + 1;
        let w = matrix.dot(&v);
        let w_norm = w.dot(&w).sqrt();
        if w_norm < 1e-15 {
            // The iterate fell into the numerical null space; stop here.
            break;
        }
        let next = &w / w_norm;
        // Distance to the previous iterate, taken over both signs.
        let same_sign: f64 = next.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
        let flipped: f64 = next.iter().zip(v.iter()).map(|(a, b)| (a + b).powi(2)).sum();
        let delta = same_sign.min(flipped).sqrt();
        v = next;
        if delta < tolerance {
            break;
        }
    }
    // Rayleigh quotient of the final iterate.
    let mv = matrix.dot(&v);
    let eigenvalue = v.dot(&mv);
    (v.to_vec(), eigenvalue, steps)
}
#[cfg(test)]
mod tests {
    // End-to-end tests for both solver paths: component recovery, variance
    // accounting, transform consistency, centering/scaling behavior, and
    // solver selection.
    use super::*;

    // Strongly correlated 2-D data: the first component should dominate and
    // lie close to the diagonal.
    #[test]
    fn test_pca_basic() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0; vec![x, y]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });
        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]);
        let c0 = &result.components[0];
        let angle = (c0[0].abs() - c0[1].abs()).abs();
        assert!(angle < 0.2, "First component should be near diagonal, got {:?}", c0);
    }

    // Keeping all components must account for (essentially) all variance.
    #[test]
    fn test_pca_identity() {
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 3,
            ..Default::default()
        });
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}", total);
    }

    // Ratios over a full decomposition should sum to ~1 and be descending.
    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin(), x.cos(), (x * 0.3).exp().min(100.0)]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 5,
            ..Default::default()
        });
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}", total);
        for i in 1..result.explained_variance_ratio.len() {
            assert!(result.explained_variance_ratio[i] <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending");
        }
    }

    // Perfectly linearly dependent features: one component suffices and
    // transform() yields the expected output shape.
    #[test]
    fn test_pca_transform() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 1,
            ..Default::default()
        });
        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);
        assert!(result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]);
    }

    // The fitted mean must reflect the (offset) data, and a large constant
    // offset must not disturb the decomposition.
    #[test]
    fn test_pca_centering() {
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            center: true,
            ..Default::default()
        });
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);
        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    // On axis-aligned data the power iteration should converge in few steps
    // and recover the x axis.
    #[test]
    fn test_pca_convergence() {
        let data: Vec<Vec<f64>> = (0..20)
            .map(|i| vec![i as f64, 0.0])
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 1,
            max_iterations: 100,
            tolerance: 1e-10,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });
        assert!(result.iterations_used < 50,
            "Should converge quickly, used {} iterations", result.iterations_used);
        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(c0[1].abs() < 0.1, "Should have near-zero y component, got {:?}", c0);
    }

    // Without scaling the large-magnitude feature dominates the first
    // component; with scaling both features contribute comparably.
    #[test]
    fn test_pca_scaling() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();
        let result_no_scale = pca(&data, PcaConfig {
            n_components: 2,
            scale: false,
            ..Default::default()
        });
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());
        let result_scaled = pca(&data, PcaConfig {
            n_components: 2,
            scale: true,
            ..Default::default()
        });
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}", ratio);
    }

    // Explicit randomized solver on tall data should still find the
    // dominant linear trend.
    #[test]
    fn test_pca_randomized_basic() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, (x * 0.01).cos() * 5.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });
        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.8,
            "Randomized SVD: first component should explain >80% variance, got {}",
            result.explained_variance_ratio[0]);
    }

    // Components returned by the randomized path must be mutually
    // (near-)orthogonal.
    #[test]
    fn test_pca_randomized_orthogonality() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, x.cos() * 5.0, (x * 0.3).exp().min(100.0)]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 4,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });
        for i in 0..result.components.len() {
            for j in (i + 1)..result.components.len() {
                let dot: f64 = result.components[i].iter()
                    .zip(result.components[j].iter())
                    .map(|(a, b)| a * b)
                    .sum();
                assert!(dot.abs() < 0.05,
                    "Components {} and {} should be orthogonal, dot product = {}", i, j, dot);
            }
        }
    }

    // With n=600 samples and k=2 of 4 dims, Auto should take the randomized
    // path; detected via iterations_used reporting the power-iteration count.
    #[test]
    fn test_pca_auto_selects_randomized() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0, x.sin(), x.cos()]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Auto,
            ..Default::default()
        });
        assert!(result.iterations_used <= 10,
            "Auto should select randomized (iters={})", result.iterations_used);
        assert_eq!(result.components.len(), 2);
    }

    // Explicitly requesting PowerIteration keeps the original behavior:
    // high variance capture and orthogonal components.
    #[test]
    fn test_pca_solver_backward_compat() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });
        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.99,
            "PowerIteration should explain >99% on correlated data, got {}",
            result.explained_variance_ratio[0]);
        let dot: f64 = result.components[0].iter()
            .zip(result.components[1].iter())
            .map(|(a, b)| a * b)
            .sum();
        assert!(dot.abs() < 0.01,
            "PowerIteration components should be orthogonal, dot = {}", dot);
    }

    // Full-rank randomized decomposition should also account for nearly all
    // of the variance (looser bound: sketching is approximate).
    #[test]
    fn test_pca_randomized_variance_sum() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x.sin() * 10.0, (x * 0.5).cos() * 5.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 3,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.1,
            "Randomized SVD ratios should sum close to 1.0, got {}", total);
    }

    // transform() and transform_one() must agree elementwise on every row.
    #[test]
    fn test_pca_transform_batch() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });
        let batch = result.transform(&data);
        for (i, row) in data.iter().enumerate() {
            let single = result.transform_one(row);
            for (j, (&b, &s)) in batch[i].iter().zip(single.iter()).enumerate() {
                assert!((b - s).abs() < 1e-10,
                    "Batch[{}][{}]={} != single={}", i, j, b, s);
            }
        }
    }
}