use ndarray::{Array1, Array2};

/// Configuration for a PCA fit.
pub struct PcaConfig {
    /// Number of principal components to extract (capped at min(n_samples, n_features)).
    pub n_components: usize,
    /// Maximum power-iteration steps per component.
    pub max_iterations: usize,
    /// Convergence threshold on the change in the estimated eigenvector.
    pub tolerance: f64,
    /// Subtract the per-feature mean before fitting.
    pub center: bool,
    /// Divide each feature by its standard deviation before fitting.
    pub scale: bool,
}

impl Default for PcaConfig {
    fn default() -> Self {
        Self {
            n_components: 2,
            max_iterations: 100,
            tolerance: 1e-6,
            center: true,
            scale: false,
        }
    }
}

/// A fitted PCA model: the principal axes plus the statistics needed to
/// project new points into component space.
pub struct PcaResult {
    /// Principal axes, one unit-length vector of `n_features` loadings per component.
    pub components: Vec<Vec<f64>>,
    /// Variance along each component (the eigenvalue), in extraction order.
    pub explained_variance: Vec<f64>,
    /// Each eigenvalue divided by the total variance of the (preprocessed) data.
    pub explained_variance_ratio: Vec<f64>,
    /// Per-feature mean subtracted during fitting (zeros if centering was off).
    pub mean: Vec<f64>,
    /// Per-feature standard deviation divided out during fitting (ones if scaling was off).
    pub std_dev: Vec<f64>,
    pub n_samples: usize,
    pub n_features: usize,
    /// Power-iteration steps used for the last component extracted.
    pub iterations_used: usize,
}

impl PcaResult {
    /// Projects each row of `data` onto the principal components.
    pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
        data.iter().map(|row| self.transform_one(row)).collect()
    }

    /// Projects a single point: apply the same centering/scaling used during
    /// fitting, then take the dot product with each component.
    pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
        let k = self.components.len();
        let d = self.n_features;
        let mut result = vec![0.0; k];
        for (c, component) in self.components.iter().enumerate() {
            let mut dot = 0.0;
            for j in 0..d {
                let mut val = point[j] - self.mean[j];
                // A zero std_dev marks a constant feature; leave it unscaled
                // to avoid dividing by zero.
                if self.std_dev[j] > 0.0 {
                    val /= self.std_dev[j];
                }
                dot += val * component[j];
            }
            result[c] = dot;
        }
        result
    }
}

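/// Fits a PCA model on row-major `data` (each inner `Vec` is one sample)
/// using power iteration with Hotelling deflation: find the leading
/// eigenvector of the covariance matrix, subtract its rank-one contribution,
/// and repeat for each requested component.
///
/// A minimal usage sketch (the values are illustrative only):
///
/// ```ignore
/// let data = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.5]];
/// let result = pca(&data, PcaConfig { n_components: 1, ..Default::default() });
/// let scores = result.transform(&data);
/// ```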
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");
    let k = config.n_components.min(d).min(n);

    // Copy the input into an n x d matrix, validating row lengths.
    let mut mat = Array2::<f64>::zeros((n, d));
    for (i, row) in data.iter().enumerate() {
        assert_eq!(row.len(), d, "All rows must have the same number of features");
        for (j, &val) in row.iter().enumerate() {
            mat[[i, j]] = val;
        }
    }

    // Optionally center each feature at zero mean.
    let mut mean = vec![0.0; d];
    if config.center {
        for j in 0..d {
            let mut s = 0.0;
            for i in 0..n {
                s += mat[[i, j]];
            }
            mean[j] = s / n as f64;
        }
        for i in 0..n {
            for j in 0..d {
                mat[[i, j]] -= mean[j];
            }
        }
    }

    // Optionally scale each feature to unit variance. Note this uses the sum
    // of squares of the (possibly centered) columns, so with `center: false`
    // it divides by a root-mean-square rather than a true standard deviation.
    let mut std_dev = vec![1.0; d];
    if config.scale {
        for j in 0..d {
            let mut ss = 0.0;
            for i in 0..n {
                ss += mat[[i, j]] * mat[[i, j]];
            }
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            // Constant features (s == 0) are left as-is.
            if s > 0.0 {
                for i in 0..n {
                    mat[[i, j]] /= s;
                }
            }
        }
    }

    // Sample covariance matrix: X^T X / (n - 1).
    let xt = mat.t();
    let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
    let mut cov = xt.dot(&mat) / denom;

    // Extract the top k eigenpairs one at a time. After each component is
    // found, deflate the covariance matrix (Hotelling deflation) so the next
    // power iteration converges to the next-largest eigenpair.
    let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
    let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
    let mut last_iters = 0;
    for _ in 0..k {
        let (eigvec, eigval, iters) =
            power_iteration(&cov, config.max_iterations, config.tolerance);
        last_iters = iters;
        for r in 0..d {
            for c in 0..d {
                cov[[r, c]] -= eigval * eigvec[r] * eigvec[c];
            }
        }
        components.push(eigvec);
        eigenvalues.push(eigval);
    }

    // Total variance = extracted eigenvalues plus whatever variance remains
    // on the diagonal of the deflated matrix; together these equal the trace
    // of the original covariance matrix.
    let total_variance: f64 = eigenvalues.iter().sum::<f64>()
        + (0..d).map(|i| cov[[i, i]]).sum::<f64>();
    let explained_variance_ratio: Vec<f64> = eigenvalues
        .iter()
        .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
        .collect();

    PcaResult {
        components,
        explained_variance: eigenvalues,
        explained_variance_ratio,
        mean,
        std_dev,
        n_samples: n,
        n_features: d,
        iterations_used: last_iters,
    }
}

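/// Estimates the dominant eigenpair of a symmetric matrix by power iteration:
/// repeatedly multiply a unit vector by the matrix and renormalize, so the
/// iterate converges toward the eigenvector whose eigenvalue has the largest
/// magnitude (for a covariance matrix this is the largest eigenvalue, since
/// none are negative). Returns the unit eigenvector, its Rayleigh-quotient
/// eigenvalue, and the number of iterations performed.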
fn power_iteration(
    matrix: &Array2<f64>,
    max_iters: usize,
    tolerance: f64,
) -> (Vec<f64>, f64, usize) {
    let d = matrix.nrows();

    // Deterministic, non-uniform starting vector (sqrt(1), sqrt(2), ...),
    // normalized to unit length: reproducible, and unlikely to start
    // orthogonal to the leading eigenvector in typical cases.
    let mut v = Array1::<f64>::zeros(d);
    for i in 0..d {
        v[i] = ((i + 1) as f64).sqrt();
    }
    let norm = v.dot(&v).sqrt();
    if norm > 0.0 {
        v /= norm;
    }

    let mut iters = 0;
    for iter in 0..max_iters {
        iters = iter + 1;
        let w = matrix.dot(&v);
        let w_norm = w.dot(&w).sqrt();
        // A (near-)zero image means v sits in the null space; stop here.
        if w_norm < 1e-15 {
            break;
        }
        let v_new = &w / w_norm;
        // Eigenvectors are only defined up to sign, so measure convergence
        // against both v and -v and take the smaller distance.
        let diff_pos: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
        let diff_neg: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a + b).powi(2)).sum();
        let diff = diff_pos.min(diff_neg).sqrt();
        v = v_new;
        if diff < tolerance {
            break;
        }
    }

    // Rayleigh quotient v^T A v gives the eigenvalue estimate for unit v.
    let cv = matrix.dot(&v);
    let eigenvalue = v.dot(&cv);
    (v.to_vec(), eigenvalue, iters)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pca_basic() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0;
                vec![x, y]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });
        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]);
        let c0 = &result.components[0];
        let angle = (c0[0].abs() - c0[1].abs()).abs();
        assert!(angle < 0.2, "First component should be near diagonal, got {:?}", c0);
    }
    #[test]
    fn test_pca_full_rank_ratio_sum() {
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 3,
            ..Default::default()
        });
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}", total);
    }
    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin(), x.cos(), (x * 0.3).exp().min(100.0)]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 5,
            ..Default::default()
        });
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}", total);
        for i in 1..result.explained_variance_ratio.len() {
            assert!(result.explained_variance_ratio[i] <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending");
        }
    }

    #[test]
    fn test_pca_transform() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 1,
            ..Default::default()
        });
        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);
        assert!(result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]);
    }

    #[test]
    fn test_pca_centering() {
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 2,
            center: true,
            ..Default::default()
        });
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);
        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    #[test]
    fn test_pca_convergence() {
        let data: Vec<Vec<f64>> = (0..20)
            .map(|i| vec![i as f64, 0.0])
            .collect();
        let result = pca(&data, PcaConfig {
            n_components: 1,
            max_iterations: 100,
            tolerance: 1e-10,
            ..Default::default()
        });
        assert!(result.iterations_used < 50,
            "Should converge quickly, used {} iterations", result.iterations_used);
        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(c0[1].abs() < 0.1, "Should have near-zero y component, got {:?}", c0);
    }

    #[test]
    fn test_pca_scaling() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();
        let result_no_scale = pca(&data, PcaConfig {
            n_components: 2,
            scale: false,
            ..Default::default()
        });
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());
        let result_scaled = pca(&data, PcaConfig {
            n_components: 2,
            scale: true,
            ..Default::default()
        });
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}", ratio);
    }
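
    // A minimal end-to-end usage sketch: fit on points that lie exactly on
    // the line y = 2x, so all variance falls on one component, then project
    // a held-out point. The expectations follow directly from how the data
    // is constructed, not from any external reference.
    #[test]
    fn test_pca_usage_sketch() {
        let train: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64, 2.0 * i as f64])
            .collect();
        let result = pca(&train, PcaConfig {
            n_components: 1,
            ..Default::default()
        });
        // All variance lies along (1, 2)/sqrt(5).
        assert!(result.explained_variance_ratio[0] > 0.999,
            "One component should capture all variance, got {}",
            result.explained_variance_ratio[0]);
        let projected = result.transform_one(&[10.0, 20.0]);
        assert_eq!(projected.len(), 1);
        assert!(projected[0].is_finite());
    }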
}