use crate::error::InsightError;
use u_numflow::matrix::Matrix;
#[derive(Debug, Clone)]
pub struct PcaConfig {
pub n_components: usize,
pub auto_scale: bool,
}
impl PcaConfig {
pub fn new(n_components: usize) -> Self {
Self {
n_components,
auto_scale: false,
}
}
pub fn auto_scale(mut self, auto_scale: bool) -> Self {
self.auto_scale = auto_scale;
self
}
}
#[derive(Debug, Clone)]
pub struct PcaResult {
pub n_components: usize,
pub eigenvalues: Vec<f64>,
pub explained_variance_ratio: Vec<f64>,
pub cumulative_variance_ratio: Vec<f64>,
pub loadings: Vec<Vec<f64>>,
pub scores: Vec<Vec<f64>>,
pub means: Vec<f64>,
pub stds: Vec<f64>,
pub n_features: usize,
}
pub fn pca(data: &[Vec<f64>], config: &PcaConfig) -> Result<PcaResult, InsightError> {
let n = data.len();
if n < 2 {
return Err(InsightError::InsufficientData {
min_required: 2,
actual: n,
});
}
let d = data[0].len();
if d == 0 {
return Err(InsightError::DegenerateData {
reason: "data has 0 features".into(),
});
}
if config.n_components == 0 || config.n_components > d {
return Err(InsightError::InvalidParameter {
name: "n_components".into(),
message: format!(
"must be between 1 and {d} (number of features), got {}",
config.n_components
),
});
}
for (i, point) in data.iter().enumerate() {
if point.len() != d {
return Err(InsightError::DimensionMismatch {
expected: d,
actual: point.len(),
});
}
for &v in point {
if v.is_nan() || v.is_infinite() {
return Err(InsightError::NonNumericColumn {
column: format!("point[{i}]"),
});
}
}
}
let mut means = vec![0.0; d];
for point in data {
for (j, &v) in point.iter().enumerate() {
means[j] += v;
}
}
for m in &mut means {
*m /= n as f64;
}
let stds = if config.auto_scale {
let mut vars = vec![0.0; d];
for point in data {
for (j, &v) in point.iter().enumerate() {
let diff = v - means[j];
vars[j] += diff * diff;
}
}
vars.iter()
.map(|&v| {
let s = (v / (n - 1) as f64).sqrt();
if s < 1e-15 {
1.0
} else {
s
} })
.collect::<Vec<f64>>()
} else {
vec![1.0; d]
};
let mut centered = vec![vec![0.0; d]; n];
for (i, point) in data.iter().enumerate() {
for j in 0..d {
centered[i][j] = (point[j] - means[j]) / stds[j];
}
}
let mut cov_data = vec![0.0; d * d];
for point in ¢ered {
for i in 0..d {
for j in i..d {
let v = point[i] * point[j];
cov_data[i * d + j] += v;
if i != j {
cov_data[j * d + i] += v;
}
}
}
}
let scale = 1.0 / (n - 1) as f64;
for v in &mut cov_data {
*v *= scale;
}
let cov_matrix = Matrix::new(d, d, cov_data).map_err(|e| InsightError::ComputationFailed {
operation: "covariance matrix construction".into(),
detail: e.to_string(),
})?;
let (eigenvalues, eigenvectors) =
cov_matrix
.eigen_symmetric()
.map_err(|e| InsightError::ComputationFailed {
operation: "eigenvalue decomposition".into(),
detail: e.to_string(),
})?;
let k = config.n_components;
let total_variance: f64 = eigenvalues.iter().sum();
let retained_eigenvalues: Vec<f64> = eigenvalues[..k].to_vec();
let explained_variance_ratio: Vec<f64> = if total_variance > 1e-15 {
retained_eigenvalues
.iter()
.map(|&ev| ev / total_variance)
.collect()
} else {
vec![0.0; k]
};
let mut cumulative_variance_ratio = Vec::with_capacity(k);
let mut cum = 0.0;
for &r in &explained_variance_ratio {
cum += r;
cumulative_variance_ratio.push(cum);
}
let mut loadings = Vec::with_capacity(k);
for comp in 0..k {
let mut loading = Vec::with_capacity(d);
for feat in 0..d {
loading.push(eigenvectors.get(feat, comp));
}
loadings.push(loading);
}
let mut scores = Vec::with_capacity(n);
for point in ¢ered {
let mut score = Vec::with_capacity(k);
for comp_loading in &loadings {
let s: f64 = point
.iter()
.zip(comp_loading.iter())
.map(|(&x, &w)| x * w)
.sum();
score.push(s);
}
scores.push(score);
}
Ok(PcaResult {
n_components: k,
eigenvalues: retained_eigenvalues,
explained_variance_ratio,
cumulative_variance_ratio,
loadings,
scores,
means,
stds,
n_features: d,
})
}
#[cfg(test)]
mod tests {
use super::*;
fn make_linear_3d() -> Vec<Vec<f64>> {
vec![
vec![1.0, 0.1, 0.01],
vec![2.0, 0.2, 0.02],
vec![3.0, 0.3, 0.03],
vec![4.0, 0.4, 0.04],
vec![5.0, 0.5, 0.05],
vec![6.0, 0.6, 0.06],
]
}
fn make_two_dim() -> Vec<Vec<f64>> {
vec![
vec![1.0, 10.0],
vec![2.0, 20.0],
vec![3.0, 30.0],
vec![4.0, 40.0],
vec![5.0, 50.0],
]
}
#[test]
fn pca_basic_3d_to_2d() {
let data = make_linear_3d();
let result = pca(&data, &PcaConfig::new(2)).unwrap();
assert_eq!(result.n_components, 2);
assert_eq!(result.eigenvalues.len(), 2);
assert_eq!(result.explained_variance_ratio.len(), 2);
assert_eq!(result.loadings.len(), 2);
assert_eq!(result.scores.len(), 6);
assert_eq!(result.scores[0].len(), 2);
}
#[test]
fn pca_first_component_dominant() {
let data = make_linear_3d();
let result = pca(&data, &PcaConfig::new(1)).unwrap();
assert!(
result.explained_variance_ratio[0] > 0.99,
"PC1 should explain >99% of variance, got {}",
result.explained_variance_ratio[0]
);
}
#[test]
fn pca_cumulative_variance() {
let data = make_linear_3d();
let result = pca(&data, &PcaConfig::new(3)).unwrap();
let last = *result.cumulative_variance_ratio.last().expect("non-empty");
assert!(
(last - 1.0).abs() < 1e-10,
"cumulative variance should sum to 1.0, got {last}"
);
for i in 1..result.cumulative_variance_ratio.len() {
assert!(result.cumulative_variance_ratio[i] >= result.cumulative_variance_ratio[i - 1]);
}
}
#[test]
fn pca_eigenvalues_descending() {
let data = make_linear_3d();
let result = pca(&data, &PcaConfig::new(3)).unwrap();
for i in 1..result.eigenvalues.len() {
assert!(
result.eigenvalues[i] <= result.eigenvalues[i - 1] + 1e-10,
"eigenvalues should be descending"
);
}
}
#[test]
fn pca_loadings_orthonormal() {
let data = make_linear_3d();
let result = pca(&data, &PcaConfig::new(3)).unwrap();
for (i, loading) in result.loadings.iter().enumerate() {
let norm: f64 = loading.iter().map(|x| x * x).sum::<f64>().sqrt();
assert!(
(norm - 1.0).abs() < 1e-10,
"loading {i} norm = {norm}, expected 1.0"
);
}
for i in 0..result.loadings.len() {
for j in (i + 1)..result.loadings.len() {
let dot: f64 = result.loadings[i]
.iter()
.zip(result.loadings[j].iter())
.map(|(&a, &b)| a * b)
.sum();
assert!(
dot.abs() < 1e-10,
"loadings {i} and {j} not orthogonal: dot = {dot}"
);
}
}
}
#[test]
fn pca_scores_dimensions() {
let data = make_two_dim();
let result = pca(&data, &PcaConfig::new(2)).unwrap();
assert_eq!(result.scores.len(), 5);
for score in &result.scores {
assert_eq!(score.len(), 2);
}
}
#[test]
fn pca_scores_centered() {
let data = make_two_dim();
let result = pca(&data, &PcaConfig::new(2)).unwrap();
let n = result.scores.len();
for comp in 0..result.n_components {
let mean: f64 = result.scores.iter().map(|s| s[comp]).sum::<f64>() / n as f64;
assert!(
mean.abs() < 1e-10,
"scores for component {comp} not centered: mean = {mean}"
);
}
}
#[test]
fn pca_auto_scale_effect() {
let data = vec![
vec![100.0, 1.0],
vec![200.0, 2.0],
vec![300.0, 3.0],
vec![400.0, 4.0],
vec![500.0, 5.0],
];
let r_no_scale = pca(&data, &PcaConfig::new(2)).unwrap();
let r_scaled = pca(&data, &PcaConfig::new(2).auto_scale(true)).unwrap();
assert!(r_no_scale.explained_variance_ratio[0] > 0.99);
assert!((r_scaled.stds[0] - r_scaled.stds[1]).abs() > 1.0);
}
#[test]
fn pca_auto_scale_stds_stored() {
let data = make_two_dim();
let result = pca(&data, &PcaConfig::new(2).auto_scale(true)).unwrap();
assert!(result.stds[0] > 0.0);
assert!(result.stds[1] > 0.0);
}
#[test]
fn pca_no_scale_stds_ones() {
let data = make_two_dim();
let result = pca(&data, &PcaConfig::new(2)).unwrap();
for &s in &result.stds {
assert!((s - 1.0).abs() < 1e-15);
}
}
#[test]
fn pca_empty_data() {
let data: Vec<Vec<f64>> = vec![];
assert!(pca(&data, &PcaConfig::new(1)).is_err());
}
#[test]
fn pca_single_point() {
let data = vec![vec![1.0, 2.0]];
assert!(pca(&data, &PcaConfig::new(1)).is_err());
}
#[test]
fn pca_zero_components() {
let data = make_two_dim();
assert!(pca(&data, &PcaConfig::new(0)).is_err());
}
#[test]
fn pca_too_many_components() {
let data = make_two_dim();
assert!(pca(&data, &PcaConfig::new(3)).is_err());
}
#[test]
fn pca_nan_rejected() {
let data = vec![vec![1.0, f64::NAN], vec![2.0, 3.0]];
assert!(pca(&data, &PcaConfig::new(1)).is_err());
}
#[test]
fn pca_inf_rejected() {
let data = vec![vec![1.0, f64::INFINITY], vec![2.0, 3.0]];
assert!(pca(&data, &PcaConfig::new(1)).is_err());
}
#[test]
fn pca_dimension_mismatch() {
let data = vec![vec![1.0, 2.0], vec![3.0]];
assert!(pca(&data, &PcaConfig::new(1)).is_err());
}
#[test]
fn pca_known_2d_result() {
let data = vec![
vec![1.0, 1.0],
vec![2.0, 2.0],
vec![3.0, 3.0],
vec![4.0, 4.0],
vec![5.0, 5.0],
];
let result = pca(&data, &PcaConfig::new(2)).unwrap();
assert!(
result.explained_variance_ratio[0] > 0.999,
"PC1 should capture all variance"
);
let l = &result.loadings[0];
let expected = 1.0 / 2.0f64.sqrt();
assert!(
(l[0].abs() - expected).abs() < 1e-10,
"loading[0] = {}, expected ±{}",
l[0],
expected
);
assert!(
(l[1].abs() - expected).abs() < 1e-10,
"loading[1] = {}, expected ±{}",
l[1],
expected
);
}
#[test]
fn pca_1d_data() {
let data = vec![vec![1.0], vec![2.0], vec![3.0], vec![4.0]];
let result = pca(&data, &PcaConfig::new(1)).unwrap();
assert_eq!(result.n_components, 1);
assert!((result.explained_variance_ratio[0] - 1.0).abs() < 1e-10);
assert!((result.loadings[0][0].abs() - 1.0).abs() < 1e-10);
}
#[test]
fn pca_high_dim_reduction() {
let mut data = Vec::new();
for i in 0..20 {
let mut point = vec![0.0; 10];
point[0] = i as f64;
point[1] = (i as f64) * 0.5 + 1.0;
data.push(point);
}
let result = pca(&data, &PcaConfig::new(2)).unwrap();
assert!(
result.cumulative_variance_ratio[1] > 0.999,
"first 2 PCs should explain >99.9% of variance, got {}",
result.cumulative_variance_ratio[1]
);
}
#[test]
fn pca_means_stored() {
let data = make_two_dim();
let result = pca(&data, &PcaConfig::new(2)).unwrap();
assert!((result.means[0] - 3.0).abs() < 1e-10);
assert!((result.means[1] - 30.0).abs() < 1e-10);
}
}