1use ndarray::{Array1, Array2, Axis};
14use rand::Rng;
15
/// Strategy for extracting principal components.
#[derive(Debug, Clone, Default)]
pub enum PcaSolver {
    /// Pick automatically: randomized SVD for large inputs when few
    /// components are requested, deflated power iteration otherwise.
    #[default]
    Auto,
    /// Randomized truncated SVD of the preprocessed data matrix.
    Randomized {
        /// Extra random probe columns beyond `n_components` (sketch-width padding).
        n_oversamples: usize,
        /// Power-iteration refinement passes applied to the random sketch.
        n_power_iters: usize,
    },
    /// Deflated power iteration on the full covariance matrix.
    PowerIteration,
}
33
/// Configuration for [`pca`].
pub struct PcaConfig {
    /// Number of components to extract (clamped to `min(n_samples, n_features)`).
    pub n_components: usize,
    /// Maximum power-iteration steps per component (power-iteration solver only).
    pub max_iterations: usize,
    /// Convergence threshold on the sign-invariant change of the iterated eigenvector.
    pub tolerance: f64,
    /// Subtract the per-feature mean before analysis.
    pub center: bool,
    /// Divide each feature by its standard deviation before analysis.
    pub scale: bool,
    /// Which solver to use; see [`PcaSolver`].
    pub solver: PcaSolver,
}
49
50impl Default for PcaConfig {
51 fn default() -> Self {
52 Self {
53 n_components: 2,
54 max_iterations: 100,
55 tolerance: 1e-6,
56 center: true,
57 scale: false,
58 solver: PcaSolver::Auto,
59 }
60 }
61}
62
/// Output of a PCA fit: the fitted basis plus the preprocessing statistics
/// needed to project new data consistently.
pub struct PcaResult {
    /// Principal axes, one row per component (each row of length `n_features`).
    pub components: Vec<Vec<f64>>,
    /// Variance explained by each component (covariance eigenvalues).
    pub explained_variance: Vec<f64>,
    /// Each component's share of the total variance.
    pub explained_variance_ratio: Vec<f64>,
    /// Per-feature mean subtracted during fitting (zeros if centering was off).
    pub mean: Vec<f64>,
    /// Per-feature scale divisor used during fitting (ones if scaling was off).
    pub std_dev: Vec<f64>,
    /// Number of samples the model was fitted on.
    pub n_samples: usize,
    /// Number of input features.
    pub n_features: usize,
    /// Iterations consumed by the solver (last power-iteration count, or the
    /// configured number of power iterations for the randomized solver).
    pub iterations_used: usize,
}
82
83impl PcaResult {
84 pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
88 let n = data.len();
89 if n == 0 || self.components.is_empty() {
90 return vec![];
91 }
92 let d = self.n_features;
93 let k = self.components.len();
94
95 let comp_flat: Vec<f64> = self
97 .components
98 .iter()
99 .flat_map(|r| r.iter().copied())
100 .collect();
101 let comp_mat = Array2::from_shape_vec((k, d), comp_flat).unwrap();
102
103 let data_flat: Vec<f64> = data
105 .iter()
106 .flat_map(|row| {
107 row.iter().enumerate().map(|(j, &val)| {
108 let mut v = val - self.mean[j];
109 if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
110 v /= self.std_dev[j];
111 }
112 v
113 })
114 })
115 .collect();
116 let data_mat = Array2::from_shape_vec((n, d), data_flat).unwrap();
117
118 let projected = data_mat.dot(&comp_mat.t());
120
121 projected
123 .rows()
124 .into_iter()
125 .map(|row| row.to_vec())
126 .collect()
127 }
128
129 pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
131 let k = self.components.len();
132 let d = self.n_features;
133 let mut result = vec![0.0; k];
134
135 for (c, component) in self.components.iter().enumerate() {
136 let mut dot = 0.0;
137 for j in 0..d {
138 let mut val = point[j] - self.mean[j];
139 if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
140 val /= self.std_dev[j];
141 }
142 dot += val * component[j];
143 }
144 result[c] = dot;
145 }
146 result
147 }
148}
149
/// Run principal component analysis on `data` (rows = samples, columns = features).
///
/// Depending on `config.solver`, uses either a randomized truncated SVD of the
/// (optionally centered/scaled) data matrix, or deflated power iteration on the
/// dense covariance matrix. The number of returned components is
/// `config.n_components` clamped to `min(n_samples, n_features)`.
///
/// # Panics
/// Panics if `data` is empty, the first row is empty, or rows have unequal lengths.
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");

    // Cannot extract more components than min(samples, features).
    let k = config.n_components.min(d).min(n);

    // Flatten into a row-major (n, d) matrix, validating row lengths on the way.
    let flat: Vec<f64> = data
        .iter()
        .flat_map(|row| {
            assert_eq!(
                row.len(),
                d,
                "All rows must have the same number of features"
            );
            row.iter().copied()
        })
        .collect();
    let mut mat = Array2::from_shape_vec((n, d), flat).unwrap();

    // Optionally subtract the per-feature mean (standard PCA centering).
    let mut mean = vec![0.0; d];
    if config.center {
        let mean_arr = mat.mean_axis(Axis(0)).unwrap();
        mat -= &mean_arr;
        mean = mean_arr.to_vec();
    }

    // Optionally divide each feature by its sample standard deviation.
    // NOTE(review): computed from `mat` after (optional) centering — when
    // `center` is false this is an RMS, not a true std-dev; confirm intended.
    let mut std_dev = vec![1.0; d];
    if config.scale {
        #[allow(clippy::needless_range_loop)]
        for j in 0..d {
            let col = mat.column(j);
            let ss: f64 = col.iter().map(|x| x * x).sum();
            // `n.max(2) - 1` avoids a zero divisor when n == 1.
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            if s > 0.0 {
                mat.column_mut(j).mapv_inplace(|x| x / s);
            }
        }
    }

    // Auto heuristic: randomized SVD pays off for many samples when k is
    // well below the smaller matrix dimension.
    let use_randomized = match &config.solver {
        PcaSolver::Auto => {
            let min_dim = n.min(d);
            n > 500 && (k as f64) < 0.8 * min_dim as f64
        }
        PcaSolver::Randomized { .. } => true,
        PcaSolver::PowerIteration => false,
    };

    // Randomized-SVD hyperparameters (defaults also used by the Auto path).
    let (n_oversamples, n_power_iters) = match &config.solver {
        PcaSolver::Randomized {
            n_oversamples,
            n_power_iters,
        } => (*n_oversamples, *n_power_iters),
        _ => (10, 4),
    };

    if use_randomized {
        let (components, singular_values) = randomized_svd(&mat, k, n_oversamples, n_power_iters);

        // Covariance eigenvalues from singular values: sigma^2 / (n - 1).
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let eigenvalues: Vec<f64> = singular_values.iter().map(|&s| s * s / denom).collect();

        let total_variance = compute_total_variance(&mat, denom);

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| {
                if total_variance > 0.0 {
                    ev / total_variance
                } else {
                    0.0
                }
            })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            // The randomized path always runs exactly n_power_iters passes.
            iterations_used: n_power_iters,
        }
    } else {
        // Dense covariance matrix: X^T X / (n - 1).
        let xt = mat.t();
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let mut cov = xt.dot(&mat) / denom;

        // Total variance is the trace of the covariance matrix.
        let total_variance: f64 = (0..d).map(|i| cov[[i, i]]).sum();

        let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
        let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
        let mut last_iters = 0;

        for _ in 0..k {
            // Dominant eigenvector of the current (deflated) covariance matrix.
            let (eigvec, _eigval, iters) =
                power_iteration(&cov, config.max_iterations, config.tolerance);
            last_iters = iters;

            // Re-orthogonalize against previously found components to counter
            // numerical drift introduced by deflation.
            let mut new_vec = eigvec;
            for prev in &components {
                let dot: f64 = new_vec.iter().zip(prev).map(|(a, b)| a * b).sum();
                for (v, p) in new_vec.iter_mut().zip(prev) {
                    *v -= dot * p;
                }
            }
            let norm: f64 = new_vec.iter().map(|x| x * x).sum::<f64>().sqrt();
            if norm > 1e-15 {
                for v in &mut new_vec {
                    *v /= norm;
                }
            }

            // Eigenvalue as the Rayleigh quotient of the orthogonalized vector.
            let v_arr = Array1::from(new_vec.clone());
            let cv = cov.dot(&v_arr);
            let eigval = v_arr.dot(&cv);

            // Hotelling deflation: remove this component's contribution so the
            // next power iteration converges to the next eigenvector.
            for r in 0..d {
                for c in 0..d {
                    cov[[r, c]] -= eigval * new_vec[r] * new_vec[c];
                }
            }

            components.push(new_vec);
            eigenvalues.push(eigval);
        }

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| {
                if total_variance > 0.0 {
                    ev / total_variance
                } else {
                    0.0
                }
            })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: last_iters,
        }
    }
}
326
327fn compute_total_variance(mat: &Array2<f64>, denom: f64) -> f64 {
329 let d = mat.ncols();
330 let mut total = 0.0;
331 for j in 0..d {
332 let col = mat.column(j);
333 let ss: f64 = col.iter().map(|x| x * x).sum();
334 total += ss / denom;
335 }
336 total
337}
338
/// Truncated randomized SVD (Halko–Martinsson–Tropp style range finder).
///
/// Sketches the range of `mat` with a random (d, l) projection, refines it
/// with `n_power_iters` power iterations (QR re-orthogonalized each pass),
/// then solves the small l x l symmetric eigenproblem of B B^T where
/// B = Q^T X. Returns the top right-singular vectors (each of length d) and
/// the corresponding singular values; may return fewer than `k` pairs when
/// trailing singular values are numerically zero.
fn randomized_svd(
    mat: &Array2<f64>,
    k: usize,
    n_oversamples: usize,
    n_power_iters: usize,
) -> (Vec<Vec<f64>>, Vec<f64>) {
    let n = mat.nrows();
    let d = mat.ncols();
    // Sketch width: k plus oversampling, clamped to the matrix dimensions.
    let l = (k + n_oversamples).min(n).min(d);
    let mut rng = rand::thread_rng();
    // Random test matrix Omega with uniform [0, 1) entries.
    let omega_flat: Vec<f64> = (0..d * l)
        .map(|_| rng.sample::<f64, _>(rand::distributions::Standard))
        .collect();
    let omega = Array2::from_shape_vec((d, l), omega_flat).unwrap();

    // Y = X * Omega sketches the column space of X.
    let mut y = mat.dot(&omega);

    // Power iterations sharpen the spectrum: Y <- X (X^T Y), with QR
    // re-orthogonalization to keep the basis numerically independent.
    for _ in 0..n_power_iters {
        qr_modified_gram_schmidt(&mut y);
        let xty = mat.t().dot(&y);
        y = mat.dot(&xty);
    }

    qr_modified_gram_schmidt(&mut y);
    let q = y;
    // Project X onto the orthonormal basis: B = Q^T X  (l x d).
    let b = q.t().dot(mat);

    // Small symmetric eigenproblem: B B^T is l x l.
    let bbt = b.dot(&b.t());
    let l_actual = bbt.nrows();

    let (eigvecs_left, eigvals) = symmetric_eigen(&bbt, l_actual);

    let mut components = Vec::with_capacity(k);
    let mut singular_values = Vec::with_capacity(k);

    #[allow(clippy::needless_range_loop)]
    for i in 0..k.min(l_actual) {
        let sigma_sq = eigvals[i];
        // Stop at numerically zero singular values; anything past this is noise.
        if sigma_sq < 1e-30 {
            break;
        }
        let sigma = sigma_sq.sqrt();
        singular_values.push(sigma);

        // Right singular vector: v_i = B^T u_i / sigma.
        let u_i = eigvecs_left.column(i);
        let v_i = b.t().dot(&u_i) / sigma;
        components.push(v_i.to_vec());
    }

    (components, singular_values)
}
421
/// Top-`k` eigenpairs of a symmetric matrix via power iteration with
/// Hotelling deflation.
///
/// Returns the eigenvectors as columns of a (d, k) matrix and the
/// eigenvalues in the order found (largest first for a PSD input).
fn symmetric_eigen(mat: &Array2<f64>, k: usize) -> (Array2<f64>, Vec<f64>) {
    let d = mat.nrows();
    let mut deflated = mat.clone();
    let mut eigvecs = Array2::<f64>::zeros((d, k));
    let mut eigvals = Vec::with_capacity(k);

    for i in 0..k {
        // Dominant eigenvector of the current deflated matrix.
        let (vec, _val, _) = power_iteration(&deflated, 200, 1e-12);

        // Re-orthogonalize against the eigenvectors already found to counter
        // numerical drift from deflation.
        let mut v = Array1::from(vec);
        for j in 0..i {
            let prev = eigvecs.column(j);
            let dot = prev.dot(&v);
            v -= &(&prev * dot);
        }
        let norm = v.dot(&v).sqrt();
        if norm > 1e-15 {
            v /= norm;
        }

        // Eigenvalue as the Rayleigh quotient on the deflated matrix.
        let av = deflated.dot(&v);
        let val = v.dot(&av);

        // Hotelling deflation: subtract val * v v^T so the next iteration
        // converges to the next eigenpair.
        for r in 0..d {
            for c in 0..d {
                deflated[[r, c]] -= val * v[r] * v[c];
            }
        }

        eigvecs.column_mut(i).assign(&v);
        eigvals.push(val);
    }

    (eigvecs, eigvals)
}
463
464fn qr_modified_gram_schmidt(mat: &mut Array2<f64>) {
472 let ncols = mat.ncols();
473 for j in 0..ncols {
474 let mut col_j = mat.column(j).to_owned();
475 for i in 0..j {
476 let col_i = mat.column(i).to_owned();
477 let r = col_i.dot(&col_j);
478 col_j -= &(&col_i * r);
479 }
480 let norm = col_j.dot(&col_j).sqrt();
481 if norm > 1e-15 {
482 col_j /= norm;
483 }
484 mat.column_mut(j).assign(&col_j);
485 }
486}
487
488fn power_iteration(
496 matrix: &Array2<f64>,
497 max_iters: usize,
498 tolerance: f64,
499) -> (Vec<f64>, f64, usize) {
500 let d = matrix.nrows();
501
502 let mut v = Array1::<f64>::zeros(d);
504 for i in 0..d {
505 v[i] = ((i + 1) as f64).sqrt();
506 }
507 let norm = v.dot(&v).sqrt();
508 if norm > 0.0 {
509 v /= norm;
510 }
511
512 let mut iters = 0;
513 for iter in 0..max_iters {
514 iters = iter + 1;
515
516 let w = matrix.dot(&v);
518
519 let w_norm = w.dot(&w).sqrt();
521 if w_norm < 1e-15 {
522 break;
524 }
525 let v_new = &w / w_norm;
526
527 let diff_pos: f64 = v_new
529 .iter()
530 .zip(v.iter())
531 .map(|(a, b)| (a - b).powi(2))
532 .sum();
533 let diff_neg: f64 = v_new
534 .iter()
535 .zip(v.iter())
536 .map(|(a, b)| (a + b).powi(2))
537 .sum();
538 let diff = diff_pos.min(diff_neg).sqrt();
539
540 v = v_new;
541
542 if diff < tolerance {
543 break;
544 }
545 }
546
547 let cv = matrix.dot(&v);
549 let eigenvalue = v.dot(&cv);
550
551 (v.to_vec(), eigenvalue, iters)
552}
553
#[cfg(test)]
mod tests {
    use super::*;

    // Two strongly correlated features: the first component should dominate
    // and point roughly along the diagonal.
    #[test]
    fn test_pca_basic() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0;
                vec![x, y]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);

        assert!(
            result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]
        );

        // Near-diagonal direction: |c0_x| ≈ |c0_y|.
        let c0 = &result.components[0];
        let angle = (c0[0].abs() - c0[1].abs()).abs();
        assert!(
            angle < 0.2,
            "First component should be near diagonal, got {:?}",
            c0
        );
    }

    // Full-rank extraction: the explained-variance ratios must sum to ~1.
    #[test]
    fn test_pca_identity() {
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 3,
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}",
            total
        );
    }

    // Ratios sum to ~1 and are returned in descending order.
    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![
                    x,
                    x * 2.0 + 1.0,
                    x.sin(),
                    x.cos(),
                    (x * 0.3).exp().min(100.0),
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 5,
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}",
            total
        );

        // Components come out largest-variance-first.
        for i in 1..result.explained_variance_ratio.len() {
            assert!(
                result.explained_variance_ratio[i]
                    <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending"
            );
        }
    }

    // Perfectly linearly dependent features: one component carries everything.
    #[test]
    fn test_pca_transform() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 1,
                ..Default::default()
            },
        );

        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);

        assert!(
            result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]
        );
    }

    // Centering must capture the (large) offset in `mean` and leave the
    // component structure unaffected.
    #[test]
    fn test_pca_centering() {
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                center: true,
                ..Default::default()
            },
        );

        // Mean of 0..=49 is 24.5; the second feature doubles it.
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);

        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    // Variance only along x: power iteration should converge fast to the x axis.
    #[test]
    fn test_pca_convergence() {
        let data: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64, 0.0]).collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 1,
                max_iterations: 100,
                tolerance: 1e-10,
                solver: PcaSolver::PowerIteration,
                ..Default::default()
            },
        );

        assert!(
            result.iterations_used < 50,
            "Should converge quickly, used {} iterations",
            result.iterations_used
        );

        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(
            c0[1].abs() < 0.1,
            "Should have near-zero y component, got {:?}",
            c0
        );
    }

    // Without scaling the 1000x-larger feature dominates; with scaling the
    // component weights should balance out.
    #[test]
    fn test_pca_scaling() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();

        let result_no_scale = pca(
            &data,
            PcaConfig {
                n_components: 2,
                scale: false,
                ..Default::default()
            },
        );
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());

        let result_scaled = pca(
            &data,
            PcaConfig {
                n_components: 2,
                scale: true,
                ..Default::default()
            },
        );
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(
            ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}",
            ratio
        );
    }

    // Randomized solver on a tall matrix recovers the dominant direction.
    #[test]
    fn test_pca_randomized_basic() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, (x * 0.01).cos() * 5.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        assert!(
            result.explained_variance_ratio[0] > 0.8,
            "Randomized SVD: first component should explain >80% variance, got {}",
            result.explained_variance_ratio[0]
        );
    }

    // Randomized solver: all pairs of components must be (near) orthogonal.
    #[test]
    fn test_pca_randomized_orthogonality() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![
                    x,
                    x * 2.0 + 1.0,
                    x.sin() * 10.0,
                    x.cos() * 5.0,
                    (x * 0.3).exp().min(100.0),
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 4,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        for i in 0..result.components.len() {
            for j in (i + 1)..result.components.len() {
                let dot: f64 = result.components[i]
                    .iter()
                    .zip(result.components[j].iter())
                    .map(|(a, b)| a * b)
                    .sum();
                assert!(
                    dot.abs() < 0.05,
                    "Components {} and {} should be orthogonal, dot product = {}",
                    i,
                    j,
                    dot
                );
            }
        }
    }

    // n=600 > 500 and k=2 << min_dim, so Auto should take the randomized path,
    // observable via the small iterations_used (== n_power_iters default, 4).
    #[test]
    fn test_pca_auto_selects_randomized() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0, x.sin(), x.cos()]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::Auto,
                ..Default::default()
            },
        );

        assert!(
            result.iterations_used <= 10,
            "Auto should select randomized (iters={})",
            result.iterations_used
        );
        assert_eq!(result.components.len(), 2);
    }

    // Forcing PowerIteration still matches the original solver's behavior.
    #[test]
    fn test_pca_solver_backward_compat() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::PowerIteration,
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        assert!(
            result.explained_variance_ratio[0] > 0.99,
            "PowerIteration should explain >99% on correlated data, got {}",
            result.explained_variance_ratio[0]
        );

        let dot: f64 = result.components[0]
            .iter()
            .zip(result.components[1].iter())
            .map(|(a, b)| a * b)
            .sum();
        assert!(
            dot.abs() < 0.01,
            "PowerIteration components should be orthogonal, dot = {}",
            dot
        );
    }

    // Randomized solver at full rank: ratios should sum close to 1.
    #[test]
    fn test_pca_randomized_variance_sum() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x.sin() * 10.0, (x * 0.5).cos() * 5.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 3,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.1,
            "Randomized SVD ratios should sum close to 1.0, got {}",
            total
        );
    }

    // transform (batch, matrix path) and transform_one (scalar path) must agree.
    #[test]
    fn test_pca_transform_batch() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                ..Default::default()
            },
        );

        let batch = result.transform(&data);
        for (i, row) in data.iter().enumerate() {
            let single = result.transform_one(row);
            for (j, (&b, &s)) in batch[i].iter().zip(single.iter()).enumerate() {
                assert!(
                    (b - s).abs() < 1e-10,
                    "Batch[{}][{}]={} != single={}",
                    i,
                    j,
                    b,
                    s
                );
            }
        }
    }
}
1003}