1use ferrolearn_core::error::FerroError;
32use ferrolearn_core::pipeline::{FittedPipelineTransformer, PipelineTransformer};
33use ferrolearn_core::traits::{Fit, Transform};
34use ndarray::{Array1, Array2};
35use num_traits::Float;
36use std::any::TypeId;
37
38#[derive(Debug, Clone)]
48pub struct PCA<F> {
49 n_components: usize,
51 _marker: std::marker::PhantomData<F>,
52}
53
54impl<F: Float + Send + Sync + 'static> PCA<F> {
55 #[must_use]
62 pub fn new(n_components: usize) -> Self {
63 Self {
64 n_components,
65 _marker: std::marker::PhantomData,
66 }
67 }
68
69 #[must_use]
71 pub fn n_components(&self) -> usize {
72 self.n_components
73 }
74}
75
76#[derive(Debug, Clone)]
87pub struct FittedPCA<F> {
88 components_: Array2<F>,
91
92 explained_variance_: Array1<F>,
95
96 explained_variance_ratio_: Array1<F>,
98
99 mean_: Array1<F>,
101
102 singular_values_: Array1<F>,
104}
105
106impl<F: Float + Send + Sync + 'static> FittedPCA<F> {
107 #[must_use]
109 pub fn components(&self) -> &Array2<F> {
110 &self.components_
111 }
112
113 #[must_use]
115 pub fn explained_variance(&self) -> &Array1<F> {
116 &self.explained_variance_
117 }
118
119 #[must_use]
121 pub fn explained_variance_ratio(&self) -> &Array1<F> {
122 &self.explained_variance_ratio_
123 }
124
125 #[must_use]
127 pub fn mean(&self) -> &Array1<F> {
128 &self.mean_
129 }
130
131 #[must_use]
133 pub fn singular_values(&self) -> &Array1<F> {
134 &self.singular_values_
135 }
136
137 pub fn inverse_transform(&self, x_reduced: &Array2<F>) -> Result<Array2<F>, FerroError> {
146 let n_components = self.components_.nrows();
147 if x_reduced.ncols() != n_components {
148 return Err(FerroError::ShapeMismatch {
149 expected: vec![x_reduced.nrows(), n_components],
150 actual: vec![x_reduced.nrows(), x_reduced.ncols()],
151 context: "FittedPCA::inverse_transform".into(),
152 });
153 }
154 let mut result = x_reduced.dot(&self.components_);
156 for mut row in result.rows_mut() {
157 for (v, &m) in row.iter_mut().zip(self.mean_.iter()) {
158 *v = *v + m;
159 }
160 }
161 Ok(result)
162 }
163}
164
165fn jacobi_eigen<F: Float + Send + Sync + 'static>(
176 a: &Array2<F>,
177 max_iter: usize,
178) -> Result<(Array1<F>, Array2<F>), FerroError> {
179 let n = a.nrows();
180 let mut mat = a.to_owned();
181 let mut v = Array2::<F>::zeros((n, n));
182 for i in 0..n {
184 v[[i, i]] = F::one();
185 }
186
187 let tol = F::from(1e-12).unwrap_or(F::epsilon());
188
189 for iteration in 0..max_iter {
190 let mut max_off = F::zero();
192 let mut p = 0;
193 let mut q = 1;
194 for i in 0..n {
195 for j in (i + 1)..n {
196 let val = mat[[i, j]].abs();
197 if val > max_off {
198 max_off = val;
199 p = i;
200 q = j;
201 }
202 }
203 }
204
205 if max_off < tol {
206 let eigenvalues = Array1::from_shape_fn(n, |i| mat[[i, i]]);
208 return Ok((eigenvalues, v));
209 }
210
211 let app = mat[[p, p]];
213 let aqq = mat[[q, q]];
214 let apq = mat[[p, q]];
215
216 let theta = if (app - aqq).abs() < tol {
217 F::from(std::f64::consts::FRAC_PI_4).unwrap_or(F::one())
218 } else {
219 let tau = (aqq - app) / (F::from(2.0).unwrap() * apq);
220 let t = if tau >= F::zero() {
222 F::one() / (tau.abs() + (F::one() + tau * tau).sqrt())
223 } else {
224 -F::one() / (tau.abs() + (F::one() + tau * tau).sqrt())
225 };
226 t.atan()
227 };
228
229 let c = theta.cos();
230 let s = theta.sin();
231
232 let mut new_mat = mat.clone();
235
236 for i in 0..n {
237 if i != p && i != q {
238 let mip = mat[[i, p]];
239 let miq = mat[[i, q]];
240 new_mat[[i, p]] = c * mip - s * miq;
241 new_mat[[p, i]] = new_mat[[i, p]];
242 new_mat[[i, q]] = s * mip + c * miq;
243 new_mat[[q, i]] = new_mat[[i, q]];
244 }
245 }
246
247 new_mat[[p, p]] = c * c * app - F::from(2.0).unwrap() * s * c * apq + s * s * aqq;
248 new_mat[[q, q]] = s * s * app + F::from(2.0).unwrap() * s * c * apq + c * c * aqq;
249 new_mat[[p, q]] = F::zero();
250 new_mat[[q, p]] = F::zero();
251
252 mat = new_mat;
253
254 for i in 0..n {
256 let vip = v[[i, p]];
257 let viq = v[[i, q]];
258 v[[i, p]] = c * vip - s * viq;
259 v[[i, q]] = s * vip + c * viq;
260 }
261
262 let _ = iteration; }
264
265 Err(FerroError::ConvergenceFailure {
266 iterations: max_iter,
267 message: "Jacobi eigendecomposition did not converge".into(),
268 })
269}
270
271fn faer_eigen_f64(a: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>), FerroError> {
280 let n = a.nrows();
281 let mat = faer::Mat::from_fn(n, n, |i, j| a[[i, j]]);
282 let decomp = mat.self_adjoint_eigen(faer::Side::Lower).map_err(|e| {
283 FerroError::NumericalInstability {
284 message: format!("faer symmetric eigendecomposition failed: {e:?}"),
285 }
286 })?;
287
288 let eigenvalues = Array1::from_shape_fn(n, |i| decomp.S().column_vector()[i]);
289 let eigenvectors = Array2::from_shape_fn((n, n), |(i, j)| decomp.U()[(i, j)]);
290
291 Ok((eigenvalues, eigenvectors))
292}
293
294fn faer_eigen_f32(a: &Array2<f32>) -> Result<(Array1<f32>, Array2<f32>), FerroError> {
297 let n = a.nrows();
298 let mat = faer::Mat::from_fn(n, n, |i, j| a[[i, j]]);
299 let decomp = mat.self_adjoint_eigen(faer::Side::Lower).map_err(|e| {
300 FerroError::NumericalInstability {
301 message: format!("faer symmetric eigendecomposition failed: {e:?}"),
302 }
303 })?;
304
305 let eigenvalues = Array1::from_shape_fn(n, |i| decomp.S().column_vector()[i]);
306 let eigenvectors = Array2::from_shape_fn((n, n), |(i, j)| decomp.U()[(i, j)]);
307
308 Ok((eigenvalues, eigenvectors))
309}
310
311fn eigen_dispatch<F: Float + Send + Sync + 'static>(
317 a: &Array2<F>,
318 max_jacobi_iter: usize,
319) -> Result<(Array1<F>, Array2<F>), FerroError> {
320 if TypeId::of::<F>() == TypeId::of::<f64>() {
324 let a_f64: &Array2<f64> = unsafe { &*(std::ptr::from_ref(a).cast::<Array2<f64>>()) };
326 let (eigenvalues, eigenvectors) = faer_eigen_f64(a_f64)?;
327 let eigenvalues_f: Array1<F> =
329 unsafe { std::mem::transmute_copy::<Array1<f64>, Array1<F>>(&eigenvalues) };
330 let eigenvectors_f: Array2<F> =
331 unsafe { std::mem::transmute_copy::<Array2<f64>, Array2<F>>(&eigenvectors) };
332 std::mem::forget(eigenvalues);
334 std::mem::forget(eigenvectors);
335 Ok((eigenvalues_f, eigenvectors_f))
336 } else if TypeId::of::<F>() == TypeId::of::<f32>() {
337 let a_f32: &Array2<f32> = unsafe { &*(std::ptr::from_ref(a).cast::<Array2<f32>>()) };
338 let (eigenvalues, eigenvectors) = faer_eigen_f32(a_f32)?;
339 let eigenvalues_f: Array1<F> =
340 unsafe { std::mem::transmute_copy::<Array1<f32>, Array1<F>>(&eigenvalues) };
341 let eigenvectors_f: Array2<F> =
342 unsafe { std::mem::transmute_copy::<Array2<f32>, Array2<F>>(&eigenvectors) };
343 std::mem::forget(eigenvalues);
344 std::mem::forget(eigenvectors);
345 Ok((eigenvalues_f, eigenvectors_f))
346 } else {
347 jacobi_eigen(a, max_jacobi_iter)
349 }
350}
351
352impl<F: Float + Send + Sync + 'static> Fit<Array2<F>, ()> for PCA<F> {
357 type Fitted = FittedPCA<F>;
358 type Error = FerroError;
359
360 fn fit(&self, x: &Array2<F>, _y: &()) -> Result<FittedPCA<F>, FerroError> {
370 let (n_samples, n_features) = x.dim();
371
372 if self.n_components == 0 {
373 return Err(FerroError::InvalidParameter {
374 name: "n_components".into(),
375 reason: "must be at least 1".into(),
376 });
377 }
378 if self.n_components > n_features {
379 return Err(FerroError::InvalidParameter {
380 name: "n_components".into(),
381 reason: format!(
382 "n_components ({}) exceeds n_features ({})",
383 self.n_components, n_features
384 ),
385 });
386 }
387 if n_samples < 2 {
388 return Err(FerroError::InsufficientSamples {
389 required: 2,
390 actual: n_samples,
391 context: "PCA::fit requires at least 2 samples".into(),
392 });
393 }
394
395 let n_f = F::from(n_samples).unwrap();
396
397 let mut mean = Array1::<F>::zeros(n_features);
399 for j in 0..n_features {
400 let col = x.column(j);
401 let sum = col.iter().copied().fold(F::zero(), |a, b| a + b);
402 mean[j] = sum / n_f;
403 }
404
405 let mut x_centered = x.to_owned();
406 for mut row in x_centered.rows_mut() {
407 for (v, &m) in row.iter_mut().zip(mean.iter()) {
408 *v = *v - m;
409 }
410 }
411
412 let n_minus_1 = F::from(n_samples - 1).unwrap();
414 let xt = x_centered.t();
415 let mut cov = xt.dot(&x_centered);
416 cov.mapv_inplace(|v| v / n_minus_1);
417
418 let max_jacobi_iter = n_features * n_features * 100 + 1000;
420 let (eigenvalues, eigenvectors) = eigen_dispatch(&cov, max_jacobi_iter)?;
421
422 let mut indices: Vec<usize> = (0..n_features).collect();
424 indices.sort_by(|&a, &b| {
425 eigenvalues[b]
426 .partial_cmp(&eigenvalues[a])
427 .unwrap_or(std::cmp::Ordering::Equal)
428 });
429
430 let total_variance = eigenvalues.iter().copied().fold(F::zero(), |a, b| a + b);
431
432 let n_comp = self.n_components;
433 let mut components = Array2::<F>::zeros((n_comp, n_features));
434 let mut explained_variance = Array1::<F>::zeros(n_comp);
435 let mut explained_variance_ratio = Array1::<F>::zeros(n_comp);
436 let mut singular_values = Array1::<F>::zeros(n_comp);
437
438 for (k, &idx) in indices.iter().take(n_comp).enumerate() {
439 let eigval = eigenvalues[idx];
440 let eigval_clamped = if eigval < F::zero() {
442 F::zero()
443 } else {
444 eigval
445 };
446 explained_variance[k] = eigval_clamped;
447 explained_variance_ratio[k] = if total_variance > F::zero() {
448 eigval_clamped / total_variance
449 } else {
450 F::zero()
451 };
452 singular_values[k] = (eigval_clamped * n_minus_1).sqrt();
454
455 for j in 0..n_features {
458 components[[k, j]] = eigenvectors[[j, idx]];
459 }
460 }
461
462 Ok(FittedPCA {
463 components_: components,
464 explained_variance_: explained_variance,
465 explained_variance_ratio_: explained_variance_ratio,
466 mean_: mean,
467 singular_values_: singular_values,
468 })
469 }
470}
471
472impl<F: Float + Send + Sync + 'static> Transform<Array2<F>> for FittedPCA<F> {
473 type Output = Array2<F>;
474 type Error = FerroError;
475
476 fn transform(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
483 let n_features = self.mean_.len();
484 if x.ncols() != n_features {
485 return Err(FerroError::ShapeMismatch {
486 expected: vec![x.nrows(), n_features],
487 actual: vec![x.nrows(), x.ncols()],
488 context: "FittedPCA::transform".into(),
489 });
490 }
491
492 let mut x_centered = x.to_owned();
494 for mut row in x_centered.rows_mut() {
495 for (v, &m) in row.iter_mut().zip(self.mean_.iter()) {
496 *v = *v - m;
497 }
498 }
499
500 Ok(x_centered.dot(&self.components_.t()))
502 }
503}
504
505impl<F: Float + Send + Sync + 'static> PipelineTransformer<F> for PCA<F> {
510 fn fit_pipeline(
518 &self,
519 x: &Array2<F>,
520 _y: &Array1<F>,
521 ) -> Result<Box<dyn FittedPipelineTransformer<F>>, FerroError> {
522 let fitted = self.fit(x, &())?;
523 Ok(Box::new(fitted))
524 }
525}
526
527impl<F: Float + Send + Sync + 'static> FittedPipelineTransformer<F> for FittedPCA<F> {
528 fn transform_pipeline(&self, x: &Array2<F>) -> Result<Array2<F>, FerroError> {
534 self.transform(x)
535 }
536}
537
538#[cfg(test)]
543mod tests {
544 use super::*;
545 use approx::assert_abs_diff_eq;
546 use ndarray::array;
547
548 #[test]
549 fn test_pca_dimensionality_reduction() {
550 let pca = PCA::<f64>::new(1);
551 let x = array![
552 [1.0, 2.0, 3.0],
553 [4.0, 5.0, 6.0],
554 [7.0, 8.0, 9.0],
555 [10.0, 11.0, 12.0],
556 ];
557 let fitted = pca.fit(&x, &()).unwrap();
558 let projected = fitted.transform(&x).unwrap();
559 assert_eq!(projected.dim(), (4, 1));
560 }
561
562 #[test]
563 fn test_pca_explained_variance_ratio_sums_le_1() {
564 let pca = PCA::<f64>::new(2);
565 let x = array![
566 [2.5, 2.4],
567 [0.5, 0.7],
568 [2.2, 2.9],
569 [1.9, 2.2],
570 [3.1, 3.0],
571 [2.3, 2.7],
572 [2.0, 1.6],
573 [1.0, 1.1],
574 [1.5, 1.6],
575 [1.1, 0.9],
576 ];
577 let fitted = pca.fit(&x, &()).unwrap();
578 let ratio_sum: f64 = fitted.explained_variance_ratio().iter().sum();
579 assert!(ratio_sum <= 1.0 + 1e-10, "ratio sum = {ratio_sum}");
581 }
582
583 #[test]
584 fn test_pca_explained_variance_ratio_partial() {
585 let pca = PCA::<f64>::new(1);
586 let x = array![
587 [2.5, 2.4],
588 [0.5, 0.7],
589 [2.2, 2.9],
590 [1.9, 2.2],
591 [3.1, 3.0],
592 [2.3, 2.7],
593 [2.0, 1.6],
594 [1.0, 1.1],
595 [1.5, 1.6],
596 [1.1, 0.9],
597 ];
598 let fitted = pca.fit(&x, &()).unwrap();
599 let ratio_sum: f64 = fitted.explained_variance_ratio().iter().sum();
600 assert!(ratio_sum <= 1.0 + 1e-10);
602 assert!(ratio_sum > 0.0);
603 }
604
605 #[test]
606 fn test_pca_components_orthonormal() {
607 let pca = PCA::<f64>::new(2);
608 let x = array![
609 [2.5, 2.4],
610 [0.5, 0.7],
611 [2.2, 2.9],
612 [1.9, 2.2],
613 [3.1, 3.0],
614 [2.3, 2.7],
615 [2.0, 1.6],
616 [1.0, 1.1],
617 [1.5, 1.6],
618 [1.1, 0.9],
619 ];
620 let fitted = pca.fit(&x, &()).unwrap();
621 let c = fitted.components();
622
623 for i in 0..c.nrows() {
625 let norm: f64 = c.row(i).iter().map(|v| v * v).sum::<f64>().sqrt();
626 assert_abs_diff_eq!(norm, 1.0, epsilon = 1e-8);
627 }
628
629 for i in 0..c.nrows() {
631 for j in (i + 1)..c.nrows() {
632 let dot: f64 = c
633 .row(i)
634 .iter()
635 .zip(c.row(j).iter())
636 .map(|(a, b)| a * b)
637 .sum();
638 assert_abs_diff_eq!(dot, 0.0, epsilon = 1e-8);
639 }
640 }
641 }
642
643 #[test]
644 fn test_pca_inverse_transform_roundtrip() {
645 let pca = PCA::<f64>::new(2);
646 let x = array![[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],];
647 let fitted = pca.fit(&x, &()).unwrap();
648 let projected = fitted.transform(&x).unwrap();
649 let recovered = fitted.inverse_transform(&projected).unwrap();
650
651 for (a, b) in x.iter().zip(recovered.iter()) {
653 assert_abs_diff_eq!(a, b, epsilon = 1e-8);
654 }
655 }
656
657 #[test]
658 fn test_pca_inverse_transform_approx() {
659 let pca = PCA::<f64>::new(1);
662 let x = array![[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],];
663 let fitted = pca.fit(&x, &()).unwrap();
664 let projected = fitted.transform(&x).unwrap();
665 let recovered = fitted.inverse_transform(&projected).unwrap();
666
667 let total_error: f64 = x
669 .iter()
670 .zip(recovered.iter())
671 .map(|(a, b)| (a - b).powi(2))
672 .sum();
673 let total_var: f64 = {
674 let mean_x: f64 = x.iter().sum::<f64>() / x.len() as f64;
675 x.iter().map(|&v| (v - mean_x).powi(2)).sum()
676 };
677 assert!(
679 total_error < total_var,
680 "error={total_error}, var={total_var}"
681 );
682 }
683
684 #[test]
685 fn test_pca_n_components_equals_n_features() {
686 let pca = PCA::<f64>::new(3);
687 let x = array![
688 [1.0, 2.0, 3.0],
689 [4.0, 5.0, 6.0],
690 [7.0, 8.0, 9.0],
691 [10.0, 11.0, 12.0],
692 ];
693 let fitted = pca.fit(&x, &()).unwrap();
694 let ratio_sum: f64 = fitted.explained_variance_ratio().iter().sum();
695 assert_abs_diff_eq!(ratio_sum, 1.0, epsilon = 1e-8);
696 }
697
698 #[test]
699 fn test_pca_single_component() {
700 let pca = PCA::<f64>::new(1);
701 let x = array![[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0],];
702 let fitted = pca.fit(&x, &()).unwrap();
703 assert_eq!(fitted.components().nrows(), 1);
704 assert_eq!(fitted.explained_variance().len(), 1);
705 }
706
707 #[test]
708 fn test_pca_shape_mismatch_transform() {
709 let pca = PCA::<f64>::new(1);
710 let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
711 let fitted = pca.fit(&x, &()).unwrap();
712 let x_bad = array![[1.0, 2.0, 3.0]];
713 assert!(fitted.transform(&x_bad).is_err());
714 }
715
716 #[test]
717 fn test_pca_shape_mismatch_inverse_transform() {
718 let pca = PCA::<f64>::new(1);
719 let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
720 let fitted = pca.fit(&x, &()).unwrap();
721 let x_bad = array![[1.0, 2.0, 3.0]];
723 assert!(fitted.inverse_transform(&x_bad).is_err());
724 }
725
726 #[test]
727 fn test_pca_invalid_n_components_zero() {
728 let pca = PCA::<f64>::new(0);
729 let x = array![[1.0, 2.0], [3.0, 4.0]];
730 assert!(pca.fit(&x, &()).is_err());
731 }
732
733 #[test]
734 fn test_pca_invalid_n_components_too_large() {
735 let pca = PCA::<f64>::new(5);
736 let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
737 assert!(pca.fit(&x, &()).is_err());
738 }
739
740 #[test]
741 fn test_pca_insufficient_samples() {
742 let pca = PCA::<f64>::new(1);
743 let x = array![[1.0, 2.0]]; assert!(pca.fit(&x, &()).is_err());
745 }
746
747 #[test]
748 fn test_pca_explained_variance_positive() {
749 let pca = PCA::<f64>::new(2);
750 let x = array![[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],];
751 let fitted = pca.fit(&x, &()).unwrap();
752 for &v in fitted.explained_variance().iter() {
753 assert!(v >= 0.0, "negative variance: {v}");
754 }
755 }
756
757 #[test]
758 fn test_pca_singular_values_positive() {
759 let pca = PCA::<f64>::new(2);
760 let x = array![[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],];
761 let fitted = pca.fit(&x, &()).unwrap();
762 for &s in fitted.singular_values().iter() {
763 assert!(s >= 0.0, "negative singular value: {s}");
764 }
765 }
766
767 #[test]
768 fn test_pca_f32() {
769 let pca = PCA::<f32>::new(1);
770 let x: Array2<f32> = array![[1.0f32, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0],];
771 let fitted = pca.fit(&x, &()).unwrap();
772 let projected = fitted.transform(&x).unwrap();
773 assert_eq!(projected.ncols(), 1);
774 }
775
776 #[test]
777 fn test_pca_n_components_getter() {
778 let pca = PCA::<f64>::new(3);
779 assert_eq!(pca.n_components(), 3);
780 }
781
782 #[test]
783 fn test_pca_pipeline_integration() {
784 use ferrolearn_core::pipeline::{FittedPipelineEstimator, Pipeline, PipelineEstimator};
785 use ferrolearn_core::traits::Predict;
786
787 struct SumEstimator;
789
790 impl PipelineEstimator<f64> for SumEstimator {
791 fn fit_pipeline(
792 &self,
793 _x: &Array2<f64>,
794 _y: &Array1<f64>,
795 ) -> Result<Box<dyn FittedPipelineEstimator<f64>>, FerroError> {
796 Ok(Box::new(FittedSumEstimator))
797 }
798 }
799
800 struct FittedSumEstimator;
801
802 impl FittedPipelineEstimator<f64> for FittedSumEstimator {
803 fn predict_pipeline(&self, x: &Array2<f64>) -> Result<Array1<f64>, FerroError> {
804 let sums: Vec<f64> = x.rows().into_iter().map(|r| r.sum()).collect();
805 Ok(Array1::from_vec(sums))
806 }
807 }
808
809 let pipeline = Pipeline::new()
810 .transform_step("pca", Box::new(PCA::<f64>::new(1)))
811 .estimator_step("sum", Box::new(SumEstimator));
812
813 let x = array![
814 [1.0, 2.0, 3.0],
815 [4.0, 5.0, 6.0],
816 [7.0, 8.0, 9.0],
817 [10.0, 11.0, 12.0],
818 ];
819 let y = Array1::from_vec(vec![0.0, 1.0, 2.0, 3.0]);
820
821 let fitted = pipeline.fit(&x, &y).unwrap();
822 let preds = fitted.predict(&x).unwrap();
823 assert_eq!(preds.len(), 4);
824 }
825}