//! Principal component analysis (PCA) with two interchangeable solvers:
//! deflated power iteration on the covariance matrix, and randomized SVD
//! for larger inputs.

use ndarray::{Array1, Array2, Axis};
use rand::Rng;

/// Strategy used to extract the principal components.
#[derive(Debug, Clone)]
pub enum PcaSolver {
    /// Choose a solver automatically from the input size.
    Auto,
    /// Randomized SVD (range finding with oversampling and power iterations,
    /// in the style of Halko et al.).
    Randomized {
        /// Extra random probe columns beyond `n_components`.
        n_oversamples: usize,
        /// Power iterations used to sharpen the range estimate.
        n_power_iters: usize,
    },
    /// Deflated power iteration on the full covariance matrix.
    PowerIteration,
}

impl Default for PcaSolver {
    fn default() -> Self {
        PcaSolver::Auto
    }
}

/// Configuration for [`pca`].
pub struct PcaConfig {
    /// Number of components to keep (clamped to `min(n_samples, n_features)`).
    pub n_components: usize,
    /// Iteration cap for the power-iteration solver.
    pub max_iterations: usize,
    /// Convergence tolerance for the power-iteration solver.
    pub tolerance: f64,
    /// Subtract the per-feature mean before fitting.
    pub center: bool,
    /// Divide each feature by its sample standard deviation before fitting.
    pub scale: bool,
    /// Solver to use.
    pub solver: PcaSolver,
}

impl Default for PcaConfig {
    fn default() -> Self {
        Self {
            n_components: 2,
            max_iterations: 100,
            tolerance: 1e-6,
            center: true,
            scale: false,
            solver: PcaSolver::Auto,
        }
    }
}

/// A fitted PCA model: the principal axes plus the statistics needed to
/// project new data exactly as the training data was projected.
pub struct PcaResult {
    /// Principal axes, one row per component, each of length `n_features`.
    pub components: Vec<Vec<f64>>,
    /// Variance explained by each component (covariance eigenvalues).
    pub explained_variance: Vec<f64>,
    /// `explained_variance` as a fraction of the total variance.
    pub explained_variance_ratio: Vec<f64>,
    /// Per-feature mean used for centering (zeros when centering is off).
    pub mean: Vec<f64>,
    /// Per-feature standard deviation used for scaling (ones when scaling is off).
    pub std_dev: Vec<f64>,
    /// Number of training rows.
    pub n_samples: usize,
    /// Number of training features.
    pub n_features: usize,
    /// Iterations used by the solver (power iterations on the randomized path).
    pub iterations_used: usize,
}

impl PcaResult {
    /// Projects a batch of row-major points onto the principal axes.
    pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let n = data.len();
        if n == 0 || self.components.is_empty() {
            return vec![];
        }
        let d = self.n_features;
        let k = self.components.len();

        let comp_flat: Vec<f64> = self
            .components
            .iter()
            .flat_map(|r| r.iter().copied())
            .collect();
        let comp_mat = Array2::from_shape_vec((k, d), comp_flat).unwrap();

        // Apply the same centering/scaling that was used during fitting.
        let data_flat: Vec<f64> = data
            .iter()
            .flat_map(|row| {
                row.iter().enumerate().map(|(j, &val)| {
                    let mut v = val - self.mean[j];
                    if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                        v /= self.std_dev[j];
                    }
                    v
                })
            })
            .collect();
        let data_mat = Array2::from_shape_vec((n, d), data_flat).unwrap();

        let projected = data_mat.dot(&comp_mat.t());

        projected
            .rows()
            .into_iter()
            .map(|row| row.to_vec())
            .collect()
    }

    /// Projects a single point; equivalent to one row of [`Self::transform`]
    /// without the matrix round-trip.
    pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
        let k = self.components.len();
        let d = self.n_features;
        let mut result = vec![0.0; k];

        for (c, component) in self.components.iter().enumerate() {
            let mut dot = 0.0;
            for j in 0..d {
                let mut val = point[j] - self.mean[j];
                if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                    val /= self.std_dev[j];
                }
                dot += val * component[j];
            }
            result[c] = dot;
        }
        result
    }
}
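
/// Fits a PCA model to the row-major matrix `data`.
///
/// A minimal usage sketch (illustrative values; marked `ignore` because the
/// doc-test cannot know this crate's import path):
///
/// ```ignore
/// let data = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.0]];
/// let result = pca(&data, PcaConfig { n_components: 1, ..Default::default() });
/// let projected = result.transform(&data); // three rows, one coordinate each
/// ```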
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");

    let k = config.n_components.min(d).min(n);

    let flat: Vec<f64> = data
        .iter()
        .flat_map(|row| {
            assert_eq!(row.len(), d, "All rows must have the same number of features");
            row.iter().copied()
        })
        .collect();
    let mut mat = Array2::from_shape_vec((n, d), flat).unwrap();

    let mut mean = vec![0.0; d];
    if config.center {
        let mean_arr = mat.mean_axis(Axis(0)).unwrap();
        mat -= &mean_arr;
        mean = mean_arr.to_vec();
    }

    let mut std_dev = vec![1.0; d];
    if config.scale {
        for j in 0..d {
            let col = mat.column(j);
            let ss: f64 = col.iter().map(|x| x * x).sum();
            // Sample standard deviation; `n.max(2)` keeps the denominator
            // positive when there is only one row.
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            if s > 0.0 {
                mat.column_mut(j).mapv_inplace(|x| x / s);
            }
        }
    }

    // Auto: randomized SVD pays off once the data is large and only a small
    // fraction of the possible components is requested.
    let use_randomized = match &config.solver {
        PcaSolver::Auto => {
            let min_dim = n.min(d);
            n > 500 && (k as f64) < 0.8 * min_dim as f64
        }
        PcaSolver::Randomized { .. } => true,
        PcaSolver::PowerIteration => false,
    };

    let (n_oversamples, n_power_iters) = match &config.solver {
        PcaSolver::Randomized { n_oversamples, n_power_iters } => (*n_oversamples, *n_power_iters),
        _ => (10, 4),
    };

    if use_randomized {
        let (components, singular_values) = randomized_svd(&mat, k, n_oversamples, n_power_iters);

        // Covariance eigenvalues from singular values: λ_i = σ_i² / (n − 1).
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let eigenvalues: Vec<f64> = singular_values.iter().map(|&s| s * s / denom).collect();

        let total_variance = compute_total_variance(&mat, denom);

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: n_power_iters,
        }
    } else {
        let xt = mat.t();
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let mut cov = xt.dot(&mat) / denom;

        // Total variance is the trace of the covariance matrix.
        let total_variance: f64 = (0..d).map(|i| cov[[i, i]]).sum();

        let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
        let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
        let mut last_iters = 0;

        for _ in 0..k {
            let (eigvec, _eigval, iters) = power_iteration(&cov, config.max_iterations, config.tolerance);
            last_iters = iters;

            // Re-orthogonalize against the components found so far; deflation
            // alone is not exact in floating point.
            let mut new_vec = eigvec;
            for prev in &components {
                let dot: f64 = new_vec.iter().zip(prev).map(|(a, b)| a * b).sum();
                for (v, p) in new_vec.iter_mut().zip(prev) {
                    *v -= dot * p;
                }
            }
            let norm: f64 = new_vec.iter().map(|x| x * x).sum::<f64>().sqrt();
            if norm > 1e-15 {
                for v in &mut new_vec {
                    *v /= norm;
                }
            }

            // Rayleigh quotient gives the eigenvalue for the corrected vector.
            let v_arr = Array1::from(new_vec.clone());
            let cv = cov.dot(&v_arr);
            let eigval = v_arr.dot(&cv);

            // Deflate so the next power iteration converges to the
            // next-largest eigenpair.
            for r in 0..d {
                for c in 0..d {
                    cov[[r, c]] -= eigval * new_vec[r] * new_vec[c];
                }
            }

            components.push(new_vec);
            eigenvalues.push(eigval);
        }

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: last_iters,
        }
    }
}

/// Sum of per-column variances of the (already centered/scaled) data matrix.
fn compute_total_variance(mat: &Array2<f64>, denom: f64) -> f64 {
    let d = mat.ncols();
    let mut total = 0.0;
    for j in 0..d {
        let col = mat.column(j);
        let ss: f64 = col.iter().map(|x| x * x).sum();
        total += ss / denom;
    }
    total
}

/// Truncated SVD of `mat` via randomized range finding: sketch the range with
/// a random projection, refine the sketch with power iterations, then solve
/// the small projected problem exactly. Returns the top right singular
/// vectors and singular values.
fn randomized_svd(
    mat: &Array2<f64>,
    k: usize,
    n_oversamples: usize,
    n_power_iters: usize,
) -> (Vec<Vec<f64>>, Vec<f64>) {
    let n = mat.nrows();
    let d = mat.ncols();
    let l = (k + n_oversamples).min(n).min(d);
    let mut rng = rand::thread_rng();
    // Mean-zero random test matrix. `Standard` samples f64 uniformly in
    // [0, 1), so shift to [-1, 1); a Gaussian is the textbook choice, but any
    // mean-zero distribution works for range finding.
    let omega_flat: Vec<f64> = (0..d * l)
        .map(|_| rng.sample::<f64, _>(rand::distributions::Standard) * 2.0 - 1.0)
        .collect();
    let omega = Array2::from_shape_vec((d, l), omega_flat).unwrap();

    // Y = X·Ω sketches the column space of X.
    let mut y = mat.dot(&omega);

    // Power iterations sharpen the sketch: Y ← X·(Xᵀ·Y), re-orthonormalizing
    // each round so round-off does not collapse the rank.
    for _ in 0..n_power_iters {
        qr_modified_gram_schmidt(&mut y);
        let xty = mat.t().dot(&y);
        y = mat.dot(&xty);
    }

    qr_modified_gram_schmidt(&mut y);
    let q = y;
    // Project onto the sketched subspace: B = Qᵀ·X is small (l × d).
    let b = q.t().dot(mat);

    // Eigen-decompose B·Bᵀ = U·Σ²·Uᵀ to recover B's singular values.
    let bbt = b.dot(&b.t());
    let l_actual = bbt.nrows();

    let (eigvecs_left, eigvals) = symmetric_eigen(&bbt, l_actual);

    let mut components = Vec::with_capacity(k);
    let mut singular_values = Vec::with_capacity(k);

    for i in 0..k.min(l_actual) {
        let sigma_sq = eigvals[i];
        if sigma_sq < 1e-30 {
            break;
        }
        let sigma = sigma_sq.sqrt();
        singular_values.push(sigma);

        // Right singular vector: v_i = Bᵀ·u_i / σ_i.
        let u_i = eigvecs_left.column(i);
        let v_i = b.t().dot(&u_i) / sigma;
        components.push(v_i.to_vec());
    }

    (components, singular_values)
}

/// Top-k eigenpairs of a small symmetric matrix via power iteration with
/// deflation. Eigenvalues come back in descending order for a PSD input.
fn symmetric_eigen(mat: &Array2<f64>, k: usize) -> (Array2<f64>, Vec<f64>) {
    let d = mat.nrows();
    let mut deflated = mat.clone();
    let mut eigvecs = Array2::<f64>::zeros((d, k));
    let mut eigvals = Vec::with_capacity(k);

    for i in 0..k {
        let (vec, _val, _) = power_iteration(&deflated, 200, 1e-12);

        // Re-orthogonalize against the eigenvectors found so far.
        let mut v = Array1::from(vec);
        for j in 0..i {
            let prev = eigvecs.column(j);
            let dot = prev.dot(&v);
            v -= &(&prev * dot);
        }
        let norm = v.dot(&v).sqrt();
        if norm > 1e-15 {
            v /= norm;
        }

        // Rayleigh quotient for the eigenvalue.
        let av = deflated.dot(&v);
        let val = v.dot(&av);

        // Deflate the matrix by the eigenpair just found.
        for r in 0..d {
            for c in 0..d {
                deflated[[r, c]] -= val * v[r] * v[c];
            }
        }

        eigvecs.column_mut(i).assign(&v);
        eigvals.push(val);
    }

    (eigvecs, eigvals)
}

/// In-place QR orthonormalization (modified Gram-Schmidt): on return the
/// columns of `mat` are orthonormal. R is not needed here and is discarded.
fn qr_modified_gram_schmidt(mat: &mut Array2<f64>) {
    let ncols = mat.ncols();
    for j in 0..ncols {
        let mut col_j = mat.column(j).to_owned();
        for i in 0..j {
            let col_i = mat.column(i).to_owned();
            let r = col_i.dot(&col_j);
            col_j -= &(&col_i * r);
        }
        let norm = col_j.dot(&col_j).sqrt();
        if norm > 1e-15 {
            col_j /= norm;
        }
        mat.column_mut(j).assign(&col_j);
    }
}

/// Dominant eigenvector/eigenvalue of a symmetric matrix by power iteration.
/// Returns `(eigenvector, eigenvalue, iterations_used)`.
fn power_iteration(
    matrix: &Array2<f64>,
    max_iters: usize,
    tolerance: f64,
) -> (Vec<f64>, f64, usize) {
    let d = matrix.nrows();

    // Deterministic, non-uniform start vector: reproducible, and unlikely to
    // be orthogonal to the dominant eigenvector.
    let mut v = Array1::<f64>::zeros(d);
    for i in 0..d {
        v[i] = ((i + 1) as f64).sqrt();
    }
    let norm = v.dot(&v).sqrt();
    if norm > 0.0 {
        v /= norm;
    }

    let mut iters = 0;
    for iter in 0..max_iters {
        iters = iter + 1;

        let w = matrix.dot(&v);

        let w_norm = w.dot(&w).sqrt();
        if w_norm < 1e-15 {
            // The matrix annihilated the vector (e.g. fully deflated); stop.
            break;
        }
        let v_new = &w / w_norm;

        // Convergence test invariant to the eigenvector's sign, which power
        // iteration may flip between steps.
        let diff_pos: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
        let diff_neg: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a + b).powi(2)).sum();
        let diff = diff_pos.min(diff_neg).sqrt();

        v = v_new;

        if diff < tolerance {
            break;
        }
    }

    // Rayleigh quotient for the corresponding eigenvalue.
    let cv = matrix.dot(&v);
    let eigenvalue = v.dot(&cv);

    (v.to_vec(), eigenvalue, iters)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pca_basic() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0;
                vec![x, y]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);

        assert!(result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]);

        // y ≈ x, so the first axis should weight both features about equally.
        let c0 = &result.components[0];
        let imbalance = (c0[0].abs() - c0[1].abs()).abs();
        assert!(imbalance < 0.2, "First component should be near diagonal, got {:?}", c0);
    }

    #[test]
    fn test_pca_full_components_variance_sum() {
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 3,
            ..Default::default()
        });

        // Keeping every component must account for essentially all variance.
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}", total);
    }

    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin(), x.cos(), (x * 0.3).exp().min(100.0)]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 5,
            ..Default::default()
        });

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}", total);

        for i in 1..result.explained_variance_ratio.len() {
            assert!(result.explained_variance_ratio[i] <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending");
        }
    }

    #[test]
    fn test_pca_transform() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 1,
            ..Default::default()
        });

        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);

        assert!(result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]);
    }

    #[test]
    fn test_pca_centering() {
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            center: true,
            ..Default::default()
        });

        // Mean of 0..=49 is 24.5 for the first feature, 49.0 for the second.
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);

        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    #[test]
    fn test_pca_convergence() {
        let data: Vec<Vec<f64>> = (0..20)
            .map(|i| vec![i as f64, 0.0])
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 1,
            max_iterations: 100,
            tolerance: 1e-10,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });

        assert!(result.iterations_used < 50,
            "Should converge quickly, used {} iterations", result.iterations_used);

        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(c0[1].abs() < 0.1, "Should have near-zero y component, got {:?}", c0);
    }
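
    // A direct sanity check on the power-iteration helper with a closed-form
    // answer: [[2, 1], [1, 2]] has dominant eigenvalue 3 with eigenvector
    // (1, 1)/√2. The matrix is hand-picked for this test, not taken from the
    // code above.
    #[test]
    fn test_power_iteration_known_eigenpair() {
        let m = Array2::from_shape_vec((2, 2), vec![2.0, 1.0, 1.0, 2.0]).unwrap();
        let (v, eigval, _iters) = power_iteration(&m, 200, 1e-12);

        assert!((eigval - 3.0).abs() < 1e-6,
            "Dominant eigenvalue should be 3, got {}", eigval);
        let inv_sqrt2 = std::f64::consts::FRAC_1_SQRT_2;
        assert!((v[0].abs() - inv_sqrt2).abs() < 1e-6, "Got eigenvector {:?}", v);
        assert!((v[1].abs() - inv_sqrt2).abs() < 1e-6, "Got eigenvector {:?}", v);
    }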

    #[test]
    fn test_pca_scaling() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();

        // Without scaling, the large-magnitude feature dominates the first axis.
        let result_no_scale = pca(&data, PcaConfig {
            n_components: 2,
            scale: false,
            ..Default::default()
        });
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());

        let result_scaled = pca(&data, PcaConfig {
            n_components: 2,
            scale: true,
            ..Default::default()
        });
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}", ratio);
    }
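
    // Sanity check for the QR helper used by the randomized path: after
    // orthonormalization the columns should be unit length and mutually
    // orthogonal. The 4×2 matrix is hand-built so the expectation is obvious.
    #[test]
    fn test_qr_columns_orthonormal() {
        let mut m = Array2::from_shape_vec((4, 2), vec![
            1.0, 1.0,
            1.0, 2.0,
            1.0, 3.0,
            1.0, 4.0,
        ]).unwrap();
        qr_modified_gram_schmidt(&mut m);

        for i in 0..2 {
            for j in 0..2 {
                let dot = m.column(i).dot(&m.column(j));
                let expected = if i == j { 1.0 } else { 0.0 };
                assert!((dot - expected).abs() < 1e-10,
                    "q{} · q{} should be {}, got {}", i, j, expected, dot);
            }
        }
    }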

    #[test]
    fn test_pca_randomized_basic() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, (x * 0.01).cos() * 5.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.8,
            "Randomized SVD: first component should explain >80% variance, got {}",
            result.explained_variance_ratio[0]);
    }

    #[test]
    fn test_pca_randomized_orthogonality() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, x.cos() * 5.0, (x * 0.3).exp().min(100.0)]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 4,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        for i in 0..result.components.len() {
            for j in (i + 1)..result.components.len() {
                let dot: f64 = result.components[i].iter()
                    .zip(result.components[j].iter())
                    .map(|(a, b)| a * b)
                    .sum();
                assert!(dot.abs() < 0.05,
                    "Components {} and {} should be orthogonal, dot product = {}", i, j, dot);
            }
        }
    }
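
    // Checks the small symmetric eigensolver against a matrix with a known
    // spectrum: [[5, 2], [2, 2]] has eigenvalues 6 and 1 (trace 7,
    // determinant 6). The matrix is hand-picked for the test.
    #[test]
    fn test_symmetric_eigen_known_spectrum() {
        let m = Array2::from_shape_vec((2, 2), vec![5.0, 2.0, 2.0, 2.0]).unwrap();
        let (_eigvecs, eigvals) = symmetric_eigen(&m, 2);

        assert!((eigvals[0] - 6.0).abs() < 1e-6, "Expected 6, got {}", eigvals[0]);
        assert!((eigvals[1] - 1.0).abs() < 1e-6, "Expected 1, got {}", eigvals[1]);
    }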

    #[test]
    fn test_pca_auto_selects_randomized() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0, x.sin(), x.cos()]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Auto,
            ..Default::default()
        });

        // The randomized path reports `n_power_iters` (default 4) as
        // `iterations_used`, so a small count indicates it was selected.
        assert!(result.iterations_used <= 10,
            "Auto should select randomized (iters={})", result.iterations_used);
        assert_eq!(result.components.len(), 2);
    }

    #[test]
    fn test_pca_solver_backward_compat() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.99,
            "PowerIteration should explain >99% on correlated data, got {}",
            result.explained_variance_ratio[0]);

        let dot: f64 = result.components[0].iter()
            .zip(result.components[1].iter())
            .map(|(a, b)| a * b)
            .sum();
        assert!(dot.abs() < 0.01,
            "PowerIteration components should be orthogonal, dot = {}", dot);
    }

    #[test]
    fn test_pca_randomized_variance_sum() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x.sin() * 10.0, (x * 0.5).cos() * 5.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 3,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.1,
            "Randomized SVD ratios should sum close to 1.0, got {}", total);
    }

    #[test]
    fn test_pca_transform_batch() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });

        // Batch and single-point projection must agree to within round-off.
        let batch = result.transform(&data);
        for (i, row) in data.iter().enumerate() {
            let single = result.transform_one(row);
            for (j, (&b, &s)) in batch[i].iter().zip(single.iter()).enumerate() {
                assert!((b - s).abs() < 1e-10,
                    "Batch[{}][{}]={} != single={}", i, j, b, s);
            }
        }
    }
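
    // The two solvers compute the same decomposition, so on a small,
    // strongly low-rank input their explained-variance ratios should agree
    // closely. This is a consistency check rather than a tight numerical bound.
    #[test]
    fn test_pca_solvers_agree() {
        let data: Vec<Vec<f64>> = (0..300)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let power = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });
        let randomized = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 7 },
            ..Default::default()
        });

        for i in 0..2 {
            let diff = (power.explained_variance_ratio[i]
                - randomized.explained_variance_ratio[i]).abs();
            assert!(diff < 0.02,
                "Solver mismatch on component {}: power={}, randomized={}",
                i, power.explained_variance_ratio[i], randomized.explained_variance_ratio[i]);
        }
    }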
}