#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde-json", derive(serde::Serialize, serde::Deserialize))]
pub struct CCIPCA {
n_components: usize,
components: Vec<Vec<f64>>,
eigenvalues: Vec<f64>,
mean: Vec<f64>,
count: u64,
n_features: Option<usize>,
amnestic: f64,
}
impl CCIPCA {
pub fn new(n_components: usize) -> Self {
Self::with_amnestic(n_components, 0.0)
}
pub fn with_amnestic(n_components: usize, amnestic: f64) -> Self {
Self {
n_components,
components: Vec::new(),
eigenvalues: Vec::new(),
mean: Vec::new(),
count: 0,
n_features: None,
amnestic,
}
}
pub fn update(&mut self, features: &[f64]) {
let _d = self.init_or_check(features.len());
self.count += 1;
let n = self.count;
let n_f = n as f64;
for (j, &fj) in features.iter().enumerate() {
let delta = fj - self.mean[j];
self.mean[j] += delta / n_f;
}
let mut x_c: Vec<f64> = features
.iter()
.zip(self.mean.iter())
.map(|(&f, &m)| f - m)
.collect();
for i in 0..self.n_components {
if n <= (i as u64) + 1 {
self.components[i] = x_c.clone();
self.eigenvalues[i] = norm(&x_c);
if self.eigenvalues[i] > 0.0 {
for v in &mut self.components[i] {
*v /= self.eigenvalues[i];
}
}
} else {
let w = (n_f - 1.0 - self.amnestic) / n_f;
let proj = dot(&x_c, &self.components[i]);
let scale = (1.0 + self.amnestic) / n_f * proj;
let v_i: Vec<f64> = self.components[i]
.iter()
.zip(x_c.iter())
.map(|(&ci, &xj)| w * self.eigenvalues[i] * ci + scale * xj)
.collect();
let norm_v = norm(&v_i);
self.eigenvalues[i] = norm_v;
if norm_v > 0.0 {
let inv = 1.0 / norm_v;
for (cj, &vj) in self.components[i].iter_mut().zip(v_i.iter()) {
*cj = vj * inv;
}
}
}
let proj = dot(&x_c, &self.components[i]);
for (xj, &cj) in x_c.iter_mut().zip(self.components[i].iter()) {
*xj -= proj * cj;
}
}
self.sort_components();
}
pub fn transform(&self, features: &[f64]) -> Vec<f64> {
let _d = self.check_features_len(features.len());
assert!(
self.count > 0,
"CCIPCA: cannot transform before any updates"
);
let mut out = Vec::with_capacity(self.n_components);
for component in &self.components {
let proj: f64 = features
.iter()
.zip(self.mean.iter())
.zip(component.iter())
.map(|((&f, &m), &c)| (f - m) * c)
.sum();
out.push(proj);
}
out
}
pub fn update_and_transform(&mut self, features: &[f64]) -> Vec<f64> {
self.update(features);
self.transform(features)
}
pub fn reset(&mut self) {
self.components.clear();
self.eigenvalues.clear();
self.mean.clear();
self.count = 0;
self.n_features = None;
}
pub fn n_components(&self) -> usize {
self.n_components
}
pub fn n_features(&self) -> Option<usize> {
self.n_features
}
pub fn count(&self) -> u64 {
self.count
}
pub fn eigenvalues(&self) -> &[f64] {
&self.eigenvalues
}
pub fn explained_variance_ratio(&self) -> Vec<f64> {
let total: f64 = self.eigenvalues.iter().sum();
if total <= 0.0 {
return vec![0.0; self.eigenvalues.len()];
}
self.eigenvalues.iter().map(|&ev| ev / total).collect()
}
fn init_or_check(&mut self, len: usize) -> usize {
match self.n_features {
None => {
let d = len;
self.n_features = Some(d);
self.mean = vec![0.0; d];
self.components = vec![vec![0.0; d]; self.n_components];
self.eigenvalues = vec![0.0; self.n_components];
d
}
Some(d) => {
assert_eq!(len, d, "CCIPCA: expected {} features, got {}", d, len);
d
}
}
}
fn check_features_len(&self, len: usize) -> usize {
let d = self
.n_features
.expect("CCIPCA: n_features not set (call update first)");
assert_eq!(len, d, "CCIPCA: expected {} features, got {}", d, len);
d
}
fn sort_components(&mut self) {
let k = self.n_components;
for i in 1..k {
let mut j = i;
while j > 0 && self.eigenvalues[j] > self.eigenvalues[j - 1] {
self.eigenvalues.swap(j, j - 1);
self.components.swap(j, j - 1);
j -= 1;
}
}
}
}
impl Default for CCIPCA {
fn default() -> Self {
Self::new(2)
}
}
impl crate::pipeline::StreamingPreprocessor for CCIPCA {
fn update_and_transform(&mut self, features: &[f64]) -> Vec<f64> {
CCIPCA::update_and_transform(self, features)
}
fn transform(&self, features: &[f64]) -> Vec<f64> {
CCIPCA::transform(self, features)
}
fn output_dim(&self) -> Option<usize> {
Some(self.n_components)
}
fn reset(&mut self) {
CCIPCA::reset(self);
}
}
#[inline]
fn dot(a: &[f64], b: &[f64]) -> f64 {
irithyll_core::simd::simd_dot(a, b)
}
#[inline]
fn norm(a: &[f64]) -> f64 {
dot(a, a).sqrt()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::pipeline::StreamingPreprocessor;
const EPS: f64 = 1e-6;
#[test]
fn construction_and_initial_state() {
let pca = CCIPCA::new(3);
assert_eq!(pca.n_components(), 3);
assert_eq!(pca.n_features(), None);
assert_eq!(pca.count(), 0);
assert!(pca.eigenvalues().is_empty());
let pca2 = CCIPCA::with_amnestic(5, 2.0);
assert_eq!(pca2.n_components(), 5);
assert_eq!(pca2.count(), 0);
let pca3 = CCIPCA::default();
assert_eq!(pca3.n_components(), 2);
}
#[test]
fn lazy_init_sets_n_features() {
let mut pca = CCIPCA::new(2);
assert_eq!(pca.n_features(), None);
pca.update(&[1.0, 2.0, 3.0, 4.0]);
assert_eq!(pca.n_features(), Some(4));
assert_eq!(pca.count(), 1);
}
#[test]
fn known_principal_components() {
let mut pca = CCIPCA::new(2);
for i in 0..500 {
let x0 = ((i * 7 + 13) % 200) as f64 / 10.0 - 10.0;
let x1 = ((i * 3 + 7) % 200) as f64 / 100.0 - 1.0;
pca.update(&[x0, x1]);
}
assert_eq!(pca.count(), 500);
let c0 = &pca.components[0];
assert!(
c0[0].abs() > 0.9,
"first component x should be ~1.0, got {}",
c0[0]
);
assert!(
c0[1].abs() < 0.3,
"first component y should be ~0.0, got {}",
c0[1]
);
assert!(
pca.eigenvalues()[0] > pca.eigenvalues()[1],
"eigenvalue[0] = {}, eigenvalue[1] = {}",
pca.eigenvalues()[0],
pca.eigenvalues()[1]
);
let ratios = pca.explained_variance_ratio();
let total: f64 = ratios.iter().sum();
assert!(
(total - 1.0).abs() < EPS,
"explained variance ratios sum to {}, expected 1.0",
total
);
}
#[test]
fn components_are_orthogonal() {
let mut pca = CCIPCA::new(3);
for i in 0..1000 {
let x0 = ((i * 7 + 13) % 400) as f64 / 20.0 - 10.0; let x1 = ((i * 3 + 7) % 200) as f64 / 50.0 - 2.0; let x2 = ((i * 11 + 3) % 100) as f64 / 100.0 - 0.5; pca.update(&[x0, x1, x2]);
}
for i in 0..3 {
for j in (i + 1)..3 {
let d = dot(&pca.components[i], &pca.components[j]);
assert!(
d.abs() < 0.1,
"dot(component[{}], component[{}]) = {}, expected ~0",
i,
j,
d
);
}
}
for i in 0..3 {
let n = norm(&pca.components[i]);
assert!(
(n - 1.0).abs() < EPS,
"||component[{}]|| = {}, expected ~1.0",
i,
n
);
}
}
#[test]
fn dimension_reduction() {
let mut pca = CCIPCA::new(2);
for i in 0..100 {
let x = [
i as f64,
(i as f64) * 0.5,
(i as f64) * 0.1,
(i as f64) * 0.01,
(i as f64) * 0.001,
];
pca.update(&x);
}
let reduced = pca.transform(&[50.0, 25.0, 5.0, 0.5, 0.05]);
assert_eq!(reduced.len(), 2);
assert!(reduced[0].is_finite());
assert!(reduced[1].is_finite());
let reduced2 = pca.update_and_transform(&[51.0, 25.5, 5.1, 0.51, 0.051]);
assert_eq!(reduced2.len(), 2);
}
#[test]
fn streaming_preprocessor_trait() {
let mut pca = CCIPCA::new(2);
for i in 0..50 {
pca.update(&[i as f64, (i as f64) * 2.0, (i as f64) * 3.0]);
}
let mut boxed: Box<dyn StreamingPreprocessor> = Box::new(pca);
assert_eq!(boxed.output_dim(), Some(2));
let out = boxed.update_and_transform(&[25.0, 50.0, 75.0]);
assert_eq!(out.len(), 2);
let out2 = boxed.transform(&[25.0, 50.0, 75.0]);
assert_eq!(out2.len(), 2);
boxed.reset();
assert_eq!(boxed.output_dim(), Some(2));
}
#[test]
fn reset_clears_state() {
let mut pca = CCIPCA::with_amnestic(3, 1.0);
for i in 0..100 {
pca.update(&[i as f64, (i as f64) * 0.5]);
}
assert_eq!(pca.count(), 100);
assert_eq!(pca.n_features(), Some(2));
pca.reset();
assert_eq!(pca.count(), 0);
assert_eq!(pca.n_features(), None);
assert!(pca.eigenvalues().is_empty());
assert_eq!(pca.n_components(), 3);
pca.update(&[1.0, 2.0, 3.0]);
assert_eq!(pca.n_features(), Some(3));
assert_eq!(pca.count(), 1);
}
}