//! Principal component analysis (PCA) with two interchangeable solvers:
//! deflated power iteration on the covariance matrix, and randomized SVD
//! for larger inputs.

use ndarray::{Array1, Array2, Axis};
use rand::Rng;

/// Strategy used to extract the principal components.
#[derive(Debug, Clone)]
pub enum PcaSolver {
    /// Choose a solver automatically from the input size.
    Auto,
    /// Randomized SVD (range finding with oversampling and power iterations,
    /// in the style of Halko et al.).
    Randomized {
        /// Extra random probe columns beyond `n_components`.
        n_oversamples: usize,
        /// Power iterations used to sharpen the range estimate.
        n_power_iters: usize,
    },
    /// Deflated power iteration on the full covariance matrix.
    PowerIteration,
}

impl Default for PcaSolver {
    fn default() -> Self {
        PcaSolver::Auto
    }
}

/// Configuration for [`pca`].
pub struct PcaConfig {
    /// Number of components to keep (clamped to `min(n_samples, n_features)`).
    pub n_components: usize,
    /// Iteration cap for the power-iteration solver.
    pub max_iterations: usize,
    /// Convergence tolerance for the power-iteration solver.
    pub tolerance: f64,
    /// Subtract the per-feature mean before fitting.
    pub center: bool,
    /// Divide each feature by its sample standard deviation before fitting.
    pub scale: bool,
    /// Solver to use.
    pub solver: PcaSolver,
}

impl Default for PcaConfig {
    fn default() -> Self {
        Self {
            n_components: 2,
            max_iterations: 100,
            tolerance: 1e-6,
            center: true,
            scale: false,
            solver: PcaSolver::Auto,
        }
    }
}

/// A fitted PCA model: the principal axes plus the statistics needed to
/// project new data exactly as the training data was projected.
pub struct PcaResult {
    /// Principal axes, one row per component, each of length `n_features`.
    pub components: Vec<Vec<f64>>,
    /// Variance explained by each component (covariance eigenvalues).
    pub explained_variance: Vec<f64>,
    /// `explained_variance` as a fraction of the total variance.
    pub explained_variance_ratio: Vec<f64>,
    /// Per-feature mean used for centering (zeros when centering is off).
    pub mean: Vec<f64>,
    /// Per-feature standard deviation used for scaling (ones when scaling is off).
    pub std_dev: Vec<f64>,
    /// Number of training rows.
    pub n_samples: usize,
    /// Number of training features.
    pub n_features: usize,
    /// Iterations used by the solver (power iterations on the randomized path).
    pub iterations_used: usize,
}

impl PcaResult {
    /// Projects a batch of row-major points onto the principal axes.
    pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let n = data.len();
        if n == 0 || self.components.is_empty() {
            return vec![];
        }
        let d = self.n_features;
        let k = self.components.len();

        let comp_flat: Vec<f64> = self
            .components
            .iter()
            .flat_map(|r| r.iter().copied())
            .collect();
        let comp_mat = Array2::from_shape_vec((k, d), comp_flat).unwrap();

        // Apply the same centering/scaling that was used during fitting.
        let data_flat: Vec<f64> = data
            .iter()
            .flat_map(|row| {
                row.iter().enumerate().map(|(j, &val)| {
                    let mut v = val - self.mean[j];
                    if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                        v /= self.std_dev[j];
                    }
                    v
                })
            })
            .collect();
        let data_mat = Array2::from_shape_vec((n, d), data_flat).unwrap();

        let projected = data_mat.dot(&comp_mat.t());

        projected
            .rows()
            .into_iter()
            .map(|row| row.to_vec())
            .collect()
    }

    /// Projects a single point; equivalent to one row of [`Self::transform`]
    /// without the matrix round-trip.
    pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
        let k = self.components.len();
        let d = self.n_features;
        let mut result = vec![0.0; k];

        for (c, component) in self.components.iter().enumerate() {
            let mut dot = 0.0;
            for j in 0..d {
                let mut val = point[j] - self.mean[j];
                if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
                    val /= self.std_dev[j];
                }
                dot += val * component[j];
            }
            result[c] = dot;
        }
        result
    }
}
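
/// Fits a PCA model to the row-major matrix `data`.
///
/// A minimal usage sketch (illustrative values; marked `ignore` because the
/// doc-test cannot know this crate's import path):
///
/// ```ignore
/// let data = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.0]];
/// let result = pca(&data, PcaConfig { n_components: 1, ..Default::default() });
/// let projected = result.transform(&data); // three rows, one coordinate each
/// ```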
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");

    let k = config.n_components.min(d).min(n);

    let flat: Vec<f64> = data
        .iter()
        .flat_map(|row| {
            assert_eq!(row.len(), d, "All rows must have the same number of features");
            row.iter().copied()
        })
        .collect();
    let mut mat = Array2::from_shape_vec((n, d), flat).unwrap();

    let mut mean = vec![0.0; d];
    if config.center {
        let mean_arr = mat.mean_axis(Axis(0)).unwrap();
        mat -= &mean_arr;
        mean = mean_arr.to_vec();
    }

    let mut std_dev = vec![1.0; d];
    if config.scale {
        for j in 0..d {
            let col = mat.column(j);
            let ss: f64 = col.iter().map(|x| x * x).sum();
            // Sample standard deviation; `n.max(2)` keeps the denominator
            // positive when there is only one row.
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            if s > 0.0 {
                mat.column_mut(j).mapv_inplace(|x| x / s);
            }
        }
    }

    // Auto: randomized SVD pays off once the data is large and only a small
    // fraction of the possible components is requested.
    let use_randomized = match &config.solver {
        PcaSolver::Auto => {
            let min_dim = n.min(d);
            n > 500 && (k as f64) < 0.8 * min_dim as f64
        }
        PcaSolver::Randomized { .. } => true,
        PcaSolver::PowerIteration => false,
    };

    let (n_oversamples, n_power_iters) = match &config.solver {
        PcaSolver::Randomized { n_oversamples, n_power_iters } => (*n_oversamples, *n_power_iters),
        _ => (10, 4),
    };

    if use_randomized {
        let (components, singular_values) = randomized_svd(&mat, k, n_oversamples, n_power_iters);

        // Covariance eigenvalues from singular values: λ_i = σ_i² / (n − 1).
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let eigenvalues: Vec<f64> = singular_values.iter().map(|&s| s * s / denom).collect();

        let total_variance = compute_total_variance(&mat, denom);

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: n_power_iters,
        }
    } else {
        let xt = mat.t();
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let mut cov = xt.dot(&mat) / denom;

        // Total variance is the trace of the covariance matrix.
        let total_variance: f64 = (0..d).map(|i| cov[[i, i]]).sum();

        let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
        let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
        let mut last_iters = 0;

        for _ in 0..k {
            let (eigvec, _eigval, iters) = power_iteration(&cov, config.max_iterations, config.tolerance);
            last_iters = iters;

            // Re-orthogonalize against the components found so far; deflation
            // alone is not exact in floating point.
            let mut new_vec = eigvec;
            for prev in &components {
                let dot: f64 = new_vec.iter().zip(prev).map(|(a, b)| a * b).sum();
                for (v, p) in new_vec.iter_mut().zip(prev) {
                    *v -= dot * p;
                }
            }
            let norm: f64 = new_vec.iter().map(|x| x * x).sum::<f64>().sqrt();
            if norm > 1e-15 {
                for v in &mut new_vec {
                    *v /= norm;
                }
            }

            // Rayleigh quotient gives the eigenvalue for the corrected vector.
            let v_arr = Array1::from(new_vec.clone());
            let cv = cov.dot(&v_arr);
            let eigval = v_arr.dot(&cv);

            // Deflate so the next power iteration converges to the
            // next-largest eigenpair.
            for r in 0..d {
                for c in 0..d {
                    cov[[r, c]] -= eigval * new_vec[r] * new_vec[c];
                }
            }

            components.push(new_vec);
            eigenvalues.push(eigval);
        }

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| if total_variance > 0.0 { ev / total_variance } else { 0.0 })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            iterations_used: last_iters,
        }
    }
}

/// Sum of per-column variances of the (already centered/scaled) data matrix.
fn compute_total_variance(mat: &Array2<f64>, denom: f64) -> f64 {
    let d = mat.ncols();
    let mut total = 0.0;
    for j in 0..d {
        let col = mat.column(j);
        let ss: f64 = col.iter().map(|x| x * x).sum();
        total += ss / denom;
    }
    total
}

/// Truncated SVD of `mat` via randomized range finding: sketch the range with
/// a random projection, refine the sketch with power iterations, then solve
/// the small projected problem exactly. Returns the top right singular
/// vectors and singular values.
fn randomized_svd(
    mat: &Array2<f64>,
    k: usize,
    n_oversamples: usize,
    n_power_iters: usize,
) -> (Vec<Vec<f64>>, Vec<f64>) {
    let n = mat.nrows();
    let d = mat.ncols();
    let l = (k + n_oversamples).min(n).min(d);
    let mut rng = rand::thread_rng();
    // Mean-zero random test matrix. `Standard` samples f64 uniformly in
    // [0, 1), so shift to [-1, 1); a Gaussian is the textbook choice, but any
    // mean-zero distribution works for range finding.
    let omega_flat: Vec<f64> = (0..d * l)
        .map(|_| rng.sample::<f64, _>(rand::distributions::Standard) * 2.0 - 1.0)
        .collect();
    let omega = Array2::from_shape_vec((d, l), omega_flat).unwrap();

    // Y = X·Ω sketches the column space of X.
    let mut y = mat.dot(&omega);

    // Power iterations sharpen the sketch: Y ← X·(Xᵀ·Y), re-orthonormalizing
    // each round so round-off does not collapse the rank.
    for _ in 0..n_power_iters {
        qr_modified_gram_schmidt(&mut y);
        let xty = mat.t().dot(&y);
        y = mat.dot(&xty);
    }

    qr_modified_gram_schmidt(&mut y);
    let q = y;
    // Project onto the sketched subspace: B = Qᵀ·X is small (l × d).
    let b = q.t().dot(mat);

    // Eigen-decompose B·Bᵀ = U·Σ²·Uᵀ to recover B's singular values.
    let bbt = b.dot(&b.t());
    let l_actual = bbt.nrows();

    let (eigvecs_left, eigvals) = symmetric_eigen(&bbt, l_actual);

    let mut components = Vec::with_capacity(k);
    let mut singular_values = Vec::with_capacity(k);

    for i in 0..k.min(l_actual) {
        let sigma_sq = eigvals[i];
        if sigma_sq < 1e-30 {
            break;
        }
        let sigma = sigma_sq.sqrt();
        singular_values.push(sigma);

        // Right singular vector: v_i = Bᵀ·u_i / σ_i.
        let u_i = eigvecs_left.column(i);
        let v_i = b.t().dot(&u_i) / sigma;
        components.push(v_i.to_vec());
    }

    (components, singular_values)
}

/// Top-k eigenpairs of a small symmetric matrix via power iteration with
/// deflation. Eigenvalues come back in descending order for a PSD input.
fn symmetric_eigen(mat: &Array2<f64>, k: usize) -> (Array2<f64>, Vec<f64>) {
    let d = mat.nrows();
    let mut deflated = mat.clone();
    let mut eigvecs = Array2::<f64>::zeros((d, k));
    let mut eigvals = Vec::with_capacity(k);

    for i in 0..k {
        let (vec, _val, _) = power_iteration(&deflated, 200, 1e-12);

        // Re-orthogonalize against the eigenvectors found so far.
        let mut v = Array1::from(vec);
        for j in 0..i {
            let prev = eigvecs.column(j);
            let dot = prev.dot(&v);
            v -= &(&prev * dot);
        }
        let norm = v.dot(&v).sqrt();
        if norm > 1e-15 {
            v /= norm;
        }

        // Rayleigh quotient for the eigenvalue.
        let av = deflated.dot(&v);
        let val = v.dot(&av);

        // Deflate the matrix by the eigenpair just found.
        for r in 0..d {
            for c in 0..d {
                deflated[[r, c]] -= val * v[r] * v[c];
            }
        }

        eigvecs.column_mut(i).assign(&v);
        eigvals.push(val);
    }

    (eigvecs, eigvals)
}

/// In-place QR orthonormalization (modified Gram-Schmidt): on return the
/// columns of `mat` are orthonormal. R is not needed here and is discarded.
fn qr_modified_gram_schmidt(mat: &mut Array2<f64>) {
    let ncols = mat.ncols();
    for j in 0..ncols {
        let mut col_j = mat.column(j).to_owned();
        for i in 0..j {
            let col_i = mat.column(i).to_owned();
            let r = col_i.dot(&col_j);
            col_j -= &(&col_i * r);
        }
        let norm = col_j.dot(&col_j).sqrt();
        if norm > 1e-15 {
            col_j /= norm;
        }
        mat.column_mut(j).assign(&col_j);
    }
}

/// Dominant eigenvector/eigenvalue of a symmetric matrix by power iteration.
/// Returns `(eigenvector, eigenvalue, iterations_used)`.
fn power_iteration(
    matrix: &Array2<f64>,
    max_iters: usize,
    tolerance: f64,
) -> (Vec<f64>, f64, usize) {
    let d = matrix.nrows();

    // Deterministic, non-uniform start vector: reproducible, and unlikely to
    // be orthogonal to the dominant eigenvector.
    let mut v = Array1::<f64>::zeros(d);
    for i in 0..d {
        v[i] = ((i + 1) as f64).sqrt();
    }
    let norm = v.dot(&v).sqrt();
    if norm > 0.0 {
        v /= norm;
    }

    let mut iters = 0;
    for iter in 0..max_iters {
        iters = iter + 1;

        let w = matrix.dot(&v);

        let w_norm = w.dot(&w).sqrt();
        if w_norm < 1e-15 {
            // The matrix annihilated the vector (e.g. fully deflated); stop.
            break;
        }
        let v_new = &w / w_norm;

        // Convergence test invariant to the eigenvector's sign, which power
        // iteration may flip between steps.
        let diff_pos: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a - b).powi(2)).sum();
        let diff_neg: f64 = v_new.iter().zip(v.iter()).map(|(a, b)| (a + b).powi(2)).sum();
        let diff = diff_pos.min(diff_neg).sqrt();

        v = v_new;

        if diff < tolerance {
            break;
        }
    }

    // Rayleigh quotient for the corresponding eigenvalue.
    let cv = matrix.dot(&v);
    let eigenvalue = v.dot(&cv);

    (v.to_vec(), eigenvalue, iters)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pca_basic() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0;
                vec![x, y]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);

        assert!(result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]);

        // y ≈ x, so the first axis should weight both features about equally.
        let c0 = &result.components[0];
        let imbalance = (c0[0].abs() - c0[1].abs()).abs();
        assert!(imbalance < 0.2, "First component should be near diagonal, got {:?}", c0);
    }

    #[test]
    fn test_pca_full_components_variance_sum() {
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 3,
            ..Default::default()
        });

        // Keeping every component must account for essentially all variance.
        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}", total);
    }

    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin(), x.cos(), (x * 0.3).exp().min(100.0)]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 5,
            ..Default::default()
        });

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}", total);

        for i in 1..result.explained_variance_ratio.len() {
            assert!(result.explained_variance_ratio[i] <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending");
        }
    }

    #[test]
    fn test_pca_transform() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 1,
            ..Default::default()
        });

        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);

        assert!(result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]);
    }

    #[test]
    fn test_pca_centering() {
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            center: true,
            ..Default::default()
        });

        // Mean of 0..=49 is 24.5 for the first feature, 49.0 for the second.
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);

        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    #[test]
    fn test_pca_convergence() {
        let data: Vec<Vec<f64>> = (0..20)
            .map(|i| vec![i as f64, 0.0])
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 1,
            max_iterations: 100,
            tolerance: 1e-10,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });

        assert!(result.iterations_used < 50,
            "Should converge quickly, used {} iterations", result.iterations_used);

        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(c0[1].abs() < 0.1, "Should have near-zero y component, got {:?}", c0);
    }
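
    // A direct sanity check on the power-iteration helper with a closed-form
    // answer: [[2, 1], [1, 2]] has dominant eigenvalue 3 with eigenvector
    // (1, 1)/√2. The matrix is hand-picked for this test, not taken from the
    // code above.
    #[test]
    fn test_power_iteration_known_eigenpair() {
        let m = Array2::from_shape_vec((2, 2), vec![2.0, 1.0, 1.0, 2.0]).unwrap();
        let (v, eigval, _iters) = power_iteration(&m, 200, 1e-12);

        assert!((eigval - 3.0).abs() < 1e-6,
            "Dominant eigenvalue should be 3, got {}", eigval);
        let inv_sqrt2 = std::f64::consts::FRAC_1_SQRT_2;
        assert!((v[0].abs() - inv_sqrt2).abs() < 1e-6, "Got eigenvector {:?}", v);
        assert!((v[1].abs() - inv_sqrt2).abs() < 1e-6, "Got eigenvector {:?}", v);
    }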

    #[test]
    fn test_pca_scaling() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();

        // Without scaling, the large-magnitude feature dominates the first axis.
        let result_no_scale = pca(&data, PcaConfig {
            n_components: 2,
            scale: false,
            ..Default::default()
        });
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());

        let result_scaled = pca(&data, PcaConfig {
            n_components: 2,
            scale: true,
            ..Default::default()
        });
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}", ratio);
    }
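
    // Sanity check for the QR helper used by the randomized path: after
    // orthonormalization the columns should be unit length and mutually
    // orthogonal. The 4×2 matrix is hand-built so the expectation is obvious.
    #[test]
    fn test_qr_columns_orthonormal() {
        let mut m = Array2::from_shape_vec((4, 2), vec![
            1.0, 1.0,
            1.0, 2.0,
            1.0, 3.0,
            1.0, 4.0,
        ]).unwrap();
        qr_modified_gram_schmidt(&mut m);

        for i in 0..2 {
            for j in 0..2 {
                let dot = m.column(i).dot(&m.column(j));
                let expected = if i == j { 1.0 } else { 0.0 };
                assert!((dot - expected).abs() < 1e-10,
                    "q{} · q{} should be {}, got {}", i, j, expected, dot);
            }
        }
    }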

    #[test]
    fn test_pca_randomized_basic() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, (x * 0.01).cos() * 5.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.8,
            "Randomized SVD: first component should explain >80% variance, got {}",
            result.explained_variance_ratio[0]);
    }

    #[test]
    fn test_pca_randomized_orthogonality() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, x.cos() * 5.0, (x * 0.3).exp().min(100.0)]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 4,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        for i in 0..result.components.len() {
            for j in (i + 1)..result.components.len() {
                let dot: f64 = result.components[i].iter()
                    .zip(result.components[j].iter())
                    .map(|(a, b)| a * b)
                    .sum();
                assert!(dot.abs() < 0.05,
                    "Components {} and {} should be orthogonal, dot product = {}", i, j, dot);
            }
        }
    }
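
    // Checks the small symmetric eigensolver against a matrix with a known
    // spectrum: [[5, 2], [2, 2]] has eigenvalues 6 and 1 (trace 7,
    // determinant 6). The matrix is hand-picked for the test.
    #[test]
    fn test_symmetric_eigen_known_spectrum() {
        let m = Array2::from_shape_vec((2, 2), vec![5.0, 2.0, 2.0, 2.0]).unwrap();
        let (_eigvecs, eigvals) = symmetric_eigen(&m, 2);

        assert!((eigvals[0] - 6.0).abs() < 1e-6, "Expected 6, got {}", eigvals[0]);
        assert!((eigvals[1] - 1.0).abs() < 1e-6, "Expected 1, got {}", eigvals[1]);
    }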

    #[test]
    fn test_pca_auto_selects_randomized() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0, x.sin(), x.cos()]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Auto,
            ..Default::default()
        });

        // The randomized path reports `n_power_iters` (default 4) as
        // `iterations_used`, so a small count indicates it was selected.
        assert!(result.iterations_used <= 10,
            "Auto should select randomized (iters={})", result.iterations_used);
        assert_eq!(result.components.len(), 2);
    }

    #[test]
    fn test_pca_solver_backward_compat() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });

        assert_eq!(result.components.len(), 2);
        assert!(result.explained_variance_ratio[0] > 0.99,
            "PowerIteration should explain >99% on correlated data, got {}",
            result.explained_variance_ratio[0]);

        let dot: f64 = result.components[0].iter()
            .zip(result.components[1].iter())
            .map(|(a, b)| a * b)
            .sum();
        assert!(dot.abs() < 0.01,
            "PowerIteration components should be orthogonal, dot = {}", dot);
    }

    #[test]
    fn test_pca_randomized_variance_sum() {
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x.sin() * 10.0, (x * 0.5).cos() * 5.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 3,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 4 },
            ..Default::default()
        });

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!((total - 1.0).abs() < 0.1,
            "Randomized SVD ratios should sum close to 1.0, got {}", total);
    }

    #[test]
    fn test_pca_transform_batch() {
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let result = pca(&data, PcaConfig {
            n_components: 2,
            ..Default::default()
        });

        // Batch and single-point projection must agree to within round-off.
        let batch = result.transform(&data);
        for (i, row) in data.iter().enumerate() {
            let single = result.transform_one(row);
            for (j, (&b, &s)) in batch[i].iter().zip(single.iter()).enumerate() {
                assert!((b - s).abs() < 1e-10,
                    "Batch[{}][{}]={} != single={}", i, j, b, s);
            }
        }
    }
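
    // The two solvers compute the same decomposition, so on a small,
    // strongly low-rank input their explained-variance ratios should agree
    // closely. This is a consistency check rather than a tight numerical bound.
    #[test]
    fn test_pca_solvers_agree() {
        let data: Vec<Vec<f64>> = (0..300)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let power = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::PowerIteration,
            ..Default::default()
        });
        let randomized = pca(&data, PcaConfig {
            n_components: 2,
            solver: PcaSolver::Randomized { n_oversamples: 10, n_power_iters: 7 },
            ..Default::default()
        });

        for i in 0..2 {
            let diff = (power.explained_variance_ratio[i]
                - randomized.explained_variance_ratio[i]).abs();
            assert!(diff < 0.02,
                "Solver mismatch on component {}: power={}, randomized={}",
                i, power.explained_variance_ratio[i], randomized.explained_variance_ratio[i]);
        }
    }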
}