use anofox_ml_core::{FitUnsupervised, Float, InverseTransform, Result, RustMlError, Transform};
use ndarray::{Array1, Array2, Axis};
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Pca {
pub n_components: usize,
}
impl Pca {
pub fn new(n_components: usize) -> Self {
Self { n_components }
}
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(bound(deserialize = "F: serde::de::DeserializeOwned"))]
pub struct FittedPca<F: Float> {
components: Array2<F>,
explained_variance: Array1<F>,
mean: Array1<F>,
}
const POWER_ITER_STEPS: usize = 200;
impl<F: Float> FitUnsupervised<F> for Pca {
type Fitted = FittedPca<F>;
fn fit(&self, x: &Array2<F>) -> Result<Self::Fitted> {
let (n_samples, n_features) = x.dim();
if n_samples == 0 || n_features == 0 {
return Err(RustMlError::EmptyInput("input array is empty".into()));
}
if self.n_components == 0 {
return Err(RustMlError::InvalidParameter(
"n_components must be at least 1".into(),
));
}
if self.n_components > n_features {
return Err(RustMlError::InvalidParameter(format!(
"n_components ({}) must be <= n_features ({})",
self.n_components, n_features
)));
}
if n_samples < 2 {
return Err(RustMlError::InvalidParameter(
"PCA requires at least 2 samples to compute covariance".into(),
));
}
let n_f = F::from_usize(n_samples).unwrap();
let mean = x.sum_axis(Axis(0)) / n_f;
let x_centered = x - &mean;
let n_minus_1 = F::from_usize(n_samples - 1).unwrap();
let mut cov = x_centered.t().dot(&x_centered);
cov.mapv_inplace(|v| v / n_minus_1);
let mut components = Array2::<F>::zeros((self.n_components, n_features));
let mut explained_variance = Array1::<F>::zeros(self.n_components);
let eps = F::from_f64(1e-12).unwrap();
for k in 0..self.n_components {
let mut v = Array1::<F>::zeros(n_features);
for i in 0..n_features {
v[i] = F::from_usize(i + 1).unwrap();
}
for prev in 0..k {
let prev_comp = components.row(prev);
let dot = v.dot(&prev_comp);
v.scaled_add(-dot, &prev_comp);
}
let norm = v.dot(&v).sqrt();
if norm < eps {
explained_variance[k] = F::zero();
for basis_idx in 0..n_features {
v = Array1::<F>::zeros(n_features);
v[basis_idx] = F::one();
for prev in 0..k {
let prev_comp = components.row(prev);
let dot = v.dot(&prev_comp);
v.scaled_add(-dot, &prev_comp);
}
let n2 = v.dot(&v).sqrt();
if n2 > eps {
v.mapv_inplace(|vi| vi / n2);
break;
}
}
components.row_mut(k).assign(&v);
continue;
}
v.mapv_inplace(|vi| vi / norm);
let convergence_tol = F::from_f64(1e-12).unwrap();
for _ in 0..POWER_ITER_STEPS {
let mut w = cov.dot(&v);
for prev in 0..k {
let prev_comp = components.row(prev);
let dot = w.dot(&prev_comp);
w.scaled_add(-dot, &prev_comp);
}
let w_norm = w.dot(&w).sqrt();
if w_norm < F::from_f64(1e-30).unwrap() {
break;
}
let v_new = w.mapv(|wi| wi / w_norm);
let diff_vec = &v_new - &v;
let diff = diff_vec.dot(&diff_vec);
v = v_new;
if diff < convergence_tol {
break;
}
}
let cv = cov.dot(&v);
let eigenvalue = v.dot(&cv);
let eigenvalue = if eigenvalue < F::zero() {
F::zero()
} else {
eigenvalue
};
let v_col = v.view().insert_axis(Axis(1));
let v_row = v.view().insert_axis(Axis(0));
cov -= &(v_col.dot(&v_row) * eigenvalue);
components.row_mut(k).assign(&v);
explained_variance[k] = eigenvalue;
}
Ok(FittedPca {
components,
explained_variance,
mean,
})
}
}
impl<F: Float> Transform<F> for FittedPca<F> {
fn transform(&self, x: &Array2<F>) -> Result<Array2<F>> {
let n_features = self.mean.len();
if x.ncols() != n_features {
return Err(RustMlError::ShapeMismatch(format!(
"expected {} features, got {}",
n_features,
x.ncols()
)));
}
let centered = x - &self.mean;
Ok(centered.dot(&self.components.t()))
}
}
impl<F: Float> InverseTransform<F> for FittedPca<F> {
fn inverse_transform(&self, x: &Array2<F>) -> Result<Array2<F>> {
let n_components = self.components.nrows();
if x.ncols() != n_components {
return Err(RustMlError::ShapeMismatch(format!(
"expected {} components, got {}",
n_components,
x.ncols()
)));
}
Ok(x.dot(&self.components) + &self.mean)
}
}
impl<F: Float> FittedPca<F> {
pub fn components(&self) -> &Array2<F> {
&self.components
}
pub fn explained_variance(&self) -> &Array1<F> {
&self.explained_variance
}
pub fn mean(&self) -> &Array1<F> {
&self.mean
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
use ndarray::array;
#[test]
fn test_first_component_captures_most_variance() {
let x = array![
[1.0, 1.0],
[2.0, 2.1],
[3.0, 2.9],
[4.0, 4.0],
[5.0, 5.1],
[6.0, 5.9],
[7.0, 7.0],
[8.0, 8.1],
];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let var = fitted.explained_variance();
let total: f64 = var.iter().copied().sum();
let ratio = var[0] / total;
assert!(
ratio > 0.95,
"first component should capture >95% variance, got {:.4}",
ratio
);
}
#[test]
fn test_transform_inverse_transform_roundtrip() {
let x = array![
[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0],
[7.0, 8.0, 9.0],
[10.0, 11.0, 12.0],
];
let pca = Pca { n_components: 3 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let transformed = fitted.transform(&x).unwrap();
let recovered = fitted.inverse_transform(&transformed).unwrap();
for (a, b) in x.iter().zip(recovered.iter()) {
assert_abs_diff_eq!(a, b, epsilon = 1e-8);
}
}
#[test]
fn test_transform_inverse_transform_lossy() {
let x = array![
[1.0, 2.0, 0.5],
[2.0, 4.0, 1.0],
[3.0, 6.0, 1.5],
[4.0, 8.0, 2.0],
[5.0, 10.0, 2.5],
];
let pca = Pca { n_components: 1 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let transformed = fitted.transform(&x).unwrap();
let recovered = fitted.inverse_transform(&transformed).unwrap();
for (a, b) in x.iter().zip(recovered.iter()) {
assert_abs_diff_eq!(a, b, epsilon = 0.1);
}
}
#[test]
fn test_explained_variance_sorted_descending() {
let x = array![
[1.0, 0.5, 0.1],
[2.0, 1.0, 0.3],
[3.0, 1.4, 0.2],
[4.0, 2.1, 0.5],
[5.0, 2.5, 0.8],
[6.0, 3.2, 0.4],
[7.0, 3.6, 0.9],
];
let pca = Pca { n_components: 3 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let var = fitted.explained_variance();
for (i, &v) in var.iter().enumerate() {
assert!(v >= 0.0, "explained_variance[{}] = {} is negative", i, v);
}
for i in 1..var.len() {
assert!(
var[i - 1] >= var[i],
"explained_variance not sorted descending: var[{}]={} < var[{}]={}",
i - 1,
var[i - 1],
i,
var[i]
);
}
}
#[test]
fn test_n_components_exceeds_n_features() {
let x = array![[1.0, 2.0], [3.0, 4.0]];
let pca = Pca { n_components: 5 };
let result = FitUnsupervised::<f64>::fit(&pca, &x);
assert!(result.is_err());
let err = result.unwrap_err();
match err {
RustMlError::InvalidParameter(msg) => {
assert!(
msg.contains("n_components"),
"error should mention n_components: {}",
msg
);
}
other => panic!("expected InvalidParameter, got {:?}", other),
}
}
#[test]
fn test_components_are_unit_vectors() {
let x = array![
[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0],
[7.0, 8.0, 9.0],
[10.0, 11.0, 12.0],
];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
for row in fitted.components().rows() {
let norm: f64 = row.iter().map(|&v| v * v).sum::<f64>().sqrt();
assert_abs_diff_eq!(norm, 1.0, epsilon = 1e-10);
}
}
#[test]
fn test_mean_is_correct() {
let x = array![[1.0, 4.0], [3.0, 6.0]];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
assert_abs_diff_eq!(fitted.mean()[0], 2.0, epsilon = 1e-10);
assert_abs_diff_eq!(fitted.mean()[1], 5.0, epsilon = 1e-10);
}
#[test]
fn test_shape_mismatch_on_transform() {
let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
let pca = Pca { n_components: 1 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let wrong = array![[1.0, 2.0, 3.0]];
assert!(fitted.transform(&wrong).is_err());
}
#[test]
fn test_empty_input() {
let x = Array2::<f64>::zeros((0, 3));
let pca = Pca { n_components: 1 };
let result = FitUnsupervised::<f64>::fit(&pca, &x);
assert!(result.is_err());
}
#[test]
fn test_single_sample_error() {
let x = array![[1.0, 2.0, 3.0]];
let pca = Pca { n_components: 1 };
let result = FitUnsupervised::<f64>::fit(&pca, &x);
assert!(result.is_err());
}
#[test]
fn test_constant_features() {
let x = array![[1.0, 2.0], [1.0, 2.0], [1.0, 2.0], [1.0, 2.0]];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
for &v in fitted.explained_variance().iter() {
assert!(v.abs() < 1e-10, "expected near-zero variance, got {}", v);
}
}
#[test]
fn test_large_values() {
let x = array![[1e10, 2e10], [3e10, 4e10], [5e10, 6e10], [7e10, 8e10],];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let transformed = fitted.transform(&x).unwrap();
for &v in transformed.iter() {
assert!(
v.is_finite(),
"PCA on large values produced non-finite: {}",
v
);
}
for &v in fitted.explained_variance().iter() {
assert!(
v.is_finite() && v >= 0.0,
"variance should be finite and non-negative: {}",
v
);
}
}
#[test]
fn test_near_zero_variance_column() {
let x = array![
[1.0, 5.0],
[2.0, 5.0 + 1e-14],
[3.0, 5.0 - 1e-14],
[4.0, 5.0],
];
let pca = Pca { n_components: 2 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let transformed = fitted.transform(&x).unwrap();
for &v in transformed.iter() {
assert!(
v.is_finite(),
"near-zero variance column produced non-finite: {}",
v
);
}
let var = fitted.explained_variance();
assert!(var[0] > var[1] * 1e6, "first component should dominate");
}
#[test]
fn test_collinear_features() {
let x = array![
[1.0, 2.0, 0.5],
[2.0, 4.0, 1.0],
[3.0, 6.0, 1.5],
[4.0, 8.0, 2.0],
[5.0, 10.0, 2.5],
];
let pca = Pca { n_components: 3 };
let fitted = FitUnsupervised::<f64>::fit(&pca, &x).unwrap();
let var = fitted.explained_variance();
for &v in var.iter() {
assert!(
v.is_finite() && v >= -1e-10,
"variance should be finite and non-negative: {}",
v
);
}
let nonzero_count = var.iter().filter(|&&v| v > 1e-8).count();
assert!(
nonzero_count <= 2,
"collinear data should have rank <= 2, got {} non-zero eigenvalues",
nonzero_count
);
}
}