// graphmind_graph_algorithms/pca.rs

//! Principal Component Analysis (PCA)
//!
//! Implements dimensionality reduction for node feature matrices.
//!
//! Two solvers are available:
//! - **Randomized SVD** (default): Halko-Martinsson-Tropp algorithm. O(n·d·k),
//!   numerically stable, automatic orthogonality. Industry standard (scikit-learn,
//!   cuML, Spark MLlib).
//! - **Power Iteration** (legacy): Extract one eigenvector at a time from the
//!   covariance matrix, then deflate and repeat. With Gram-Schmidt
//!   re-orthogonalization for stability.

use ndarray::{Array1, Array2, Axis};
use rand::Rng;
15
/// Strategy used to extract the principal components.
#[derive(Debug, Clone, Default)]
pub enum PcaSolver {
    /// Pick a solver from the data dimensions:
    /// randomized SVD when `n > 500` and `k < 0.8 * min(n, d)`,
    /// power iteration otherwise.
    #[default]
    Auto,
    /// Randomized SVD (Halko-Martinsson-Tropp).
    Randomized {
        /// Extra sketch columns beyond `k` for accuracy (typical: 10).
        n_oversamples: usize,
        /// Subspace power iterations for numerical stability (typical: 4).
        n_power_iters: usize,
    },
    /// Legacy power iteration with deflation.
    PowerIteration,
}
33
34/// PCA configuration
35pub struct PcaConfig {
36    /// Number of principal components to extract
37    pub n_components: usize,
38    /// Maximum power iterations per component (only used for PowerIteration solver)
39    pub max_iterations: usize,
40    /// Convergence tolerance for power iteration (only used for PowerIteration solver)
41    pub tolerance: f64,
42    /// Subtract column means before PCA (default: true)
43    pub center: bool,
44    /// Divide by column std dev before PCA (default: false)
45    pub scale: bool,
46    /// Solver strategy (default: Auto)
47    pub solver: PcaSolver,
48}
49
50impl Default for PcaConfig {
51    fn default() -> Self {
52        Self {
53            n_components: 2,
54            max_iterations: 100,
55            tolerance: 1e-6,
56            center: true,
57            scale: false,
58            solver: PcaSolver::Auto,
59        }
60    }
61}
62
/// Output of a PCA run: the fitted components plus the statistics needed
/// to project new data into the reduced space.
pub struct PcaResult {
    /// Principal component vectors (n_components x n_features), row-major.
    pub components: Vec<Vec<f64>>,
    /// Variance explained by each component (eigenvalues).
    pub explained_variance: Vec<f64>,
    /// Proportion of total variance explained by each component.
    pub explained_variance_ratio: Vec<f64>,
    /// Feature means used for centering (needed to project new data).
    pub mean: Vec<f64>,
    /// Feature standard deviations (if scaling was used).
    pub std_dev: Vec<f64>,
    /// Number of samples in the input data.
    pub n_samples: usize,
    /// Number of features in the input data.
    pub n_features: usize,
    /// Number of iterations used (last component for PowerIteration,
    /// configured power iterations for Randomized).
    pub iterations_used: usize,
}
82
83impl PcaResult {
84    /// Project multiple data points into the reduced PCA space.
85    ///
86    /// Uses ndarray matrix multiplication for efficiency.
87    pub fn transform(&self, data: &[Vec<f64>]) -> Vec<Vec<f64>> {
88        let n = data.len();
89        if n == 0 || self.components.is_empty() {
90            return vec![];
91        }
92        let d = self.n_features;
93        let k = self.components.len();
94
95        // Build component matrix (k × d)
96        let comp_flat: Vec<f64> = self
97            .components
98            .iter()
99            .flat_map(|r| r.iter().copied())
100            .collect();
101        let comp_mat = Array2::from_shape_vec((k, d), comp_flat).unwrap();
102
103        // Build centered data matrix (n × d)
104        let data_flat: Vec<f64> = data
105            .iter()
106            .flat_map(|row| {
107                row.iter().enumerate().map(|(j, &val)| {
108                    let mut v = val - self.mean[j];
109                    if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
110                        v /= self.std_dev[j];
111                    }
112                    v
113                })
114            })
115            .collect();
116        let data_mat = Array2::from_shape_vec((n, d), data_flat).unwrap();
117
118        // projected = data_mat @ comp_mat^T → (n × k)
119        let projected = data_mat.dot(&comp_mat.t());
120
121        // Convert back to Vec<Vec<f64>>
122        projected
123            .rows()
124            .into_iter()
125            .map(|row| row.to_vec())
126            .collect()
127    }
128
129    /// Project a single data point into the reduced PCA space.
130    pub fn transform_one(&self, point: &[f64]) -> Vec<f64> {
131        let k = self.components.len();
132        let d = self.n_features;
133        let mut result = vec![0.0; k];
134
135        for (c, component) in self.components.iter().enumerate() {
136            let mut dot = 0.0;
137            for j in 0..d {
138                let mut val = point[j] - self.mean[j];
139                if self.std_dev[j] > 0.0 && self.std_dev[j] != 1.0 {
140                    val /= self.std_dev[j];
141                }
142                dot += val * component[j];
143            }
144            result[c] = dot;
145        }
146        result
147    }
148}
149
/// Run PCA on a feature matrix.
///
/// # Arguments
/// - `data`: slice of rows, each row is a feature vector of equal length
/// - `config`: PCA parameters (including solver selection)
///
/// # Returns
/// A `PcaResult` holding up to `min(n_components, n_samples, n_features)`
/// components, their eigenvalues/variance ratios, and the preprocessing
/// statistics (`mean`, `std_dev`) needed to project new data.
///
/// # Panics
/// Panics if `data` is empty or rows have inconsistent lengths.
pub fn pca(data: &[Vec<f64>], config: PcaConfig) -> PcaResult {
    let n = data.len();
    assert!(n > 0, "PCA requires at least one data point");
    let d = data[0].len();
    assert!(d > 0, "PCA requires at least one feature");

    // The extractable rank is capped by both data dimensions.
    let k = config.n_components.min(d).min(n);

    // Build ndarray matrix from flat data (cache-friendly)
    let flat: Vec<f64> = data
        .iter()
        .flat_map(|row| {
            assert_eq!(
                row.len(),
                d,
                "All rows must have the same number of features"
            );
            row.iter().copied()
        })
        .collect();
    let mut mat = Array2::from_shape_vec((n, d), flat).unwrap();

    // Compute column means and center using ndarray operations.
    // `mean` stays all-zero when centering is disabled, so PcaResult::transform
    // (which subtracts it unconditionally) becomes a no-op shift.
    let mut mean = vec![0.0; d];
    if config.center {
        let mean_arr = mat.mean_axis(Axis(0)).unwrap();
        mat -= &mean_arr;
        mean = mean_arr.to_vec();
    }

    // Compute column std devs and scale.
    // NOTE(review): the statistic is computed from `mat` as it stands here —
    // when `center` is false this is a root-mean-square, not a true std dev;
    // confirm that is the intended behavior for uncentered runs.
    // `n.max(2) - 1` guards n == 1 (divide by 1 instead of 0).
    // `std_dev` stays all-one when scaling is disabled so that `transform`
    // can apply it unconditionally.
    let mut std_dev = vec![1.0; d];
    if config.scale {
        #[allow(clippy::needless_range_loop)]
        for j in 0..d {
            let col = mat.column(j);
            let ss: f64 = col.iter().map(|x| x * x).sum();
            let s = (ss / (n.max(2) - 1) as f64).sqrt();
            std_dev[j] = s;
            // Constant (zero-variance) columns are left unscaled.
            if s > 0.0 {
                mat.column_mut(j).mapv_inplace(|x| x / s);
            }
        }
    }

    // Decide solver: Auto prefers randomized SVD on larger samples when the
    // requested rank is a small fraction of the smaller data dimension.
    let use_randomized = match &config.solver {
        PcaSolver::Auto => {
            let min_dim = n.min(d);
            n > 500 && (k as f64) < 0.8 * min_dim as f64
        }
        PcaSolver::Randomized { .. } => true,
        PcaSolver::PowerIteration => false,
    };

    // Randomized-SVD hyperparameters; defaults (10, 4) apply when the
    // randomized path was chosen via Auto.
    let (n_oversamples, n_power_iters) = match &config.solver {
        PcaSolver::Randomized {
            n_oversamples,
            n_power_iters,
        } => (*n_oversamples, *n_power_iters),
        _ => (10, 4),
    };

    if use_randomized {
        // ── Randomized SVD path ──
        let (components, singular_values) = randomized_svd(&mat, k, n_oversamples, n_power_iters);

        // Eigenvalues (variance) = σ² / (n-1)
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        let eigenvalues: Vec<f64> = singular_values.iter().map(|&s| s * s / denom).collect();

        // Total variance from covariance diagonal
        let total_variance = compute_total_variance(&mat, denom);

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| {
                if total_variance > 0.0 {
                    ev / total_variance
                } else {
                    0.0
                }
            })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            // For the randomized path this is the configured power-iteration
            // count, not a measured convergence count.
            iterations_used: n_power_iters,
        }
    } else {
        // ── Power iteration with deflation path ──
        let xt = mat.t();
        let denom = if n > 1 { (n - 1) as f64 } else { 1.0 };
        // Sample covariance C = Xᵀ X / (n-1); mutated in place by deflation below.
        let mut cov = xt.dot(&mat) / denom;

        // Capture total variance BEFORE deflation
        let total_variance: f64 = (0..d).map(|i| cov[[i, i]]).sum();

        let mut components: Vec<Vec<f64>> = Vec::with_capacity(k);
        let mut eigenvalues: Vec<f64> = Vec::with_capacity(k);
        let mut last_iters = 0;

        for _ in 0..k {
            let (eigvec, _eigval, iters) =
                power_iteration(&cov, config.max_iterations, config.tolerance);
            last_iters = iters;

            // Gram-Schmidt re-orthogonalization against previous components
            // (power iteration on the deflated matrix can drift back toward
            // already-extracted directions).
            let mut new_vec = eigvec;
            for prev in &components {
                let dot: f64 = new_vec.iter().zip(prev).map(|(a, b)| a * b).sum();
                for (v, p) in new_vec.iter_mut().zip(prev) {
                    *v -= dot * p;
                }
            }
            // Re-normalize after orthogonalization
            let norm: f64 = new_vec.iter().map(|x| x * x).sum::<f64>().sqrt();
            if norm > 1e-15 {
                for v in &mut new_vec {
                    *v /= norm;
                }
            }

            // Recompute eigenvalue after orthogonalization: v^T C v
            // (the value from power_iteration belonged to the pre-GS vector).
            let v_arr = Array1::from(new_vec.clone());
            let cv = cov.dot(&v_arr);
            let eigval = v_arr.dot(&cv);

            // Deflate: C = C - lambda * v * v^T
            for r in 0..d {
                for c in 0..d {
                    cov[[r, c]] -= eigval * new_vec[r] * new_vec[c];
                }
            }

            components.push(new_vec);
            eigenvalues.push(eigval);
        }

        let explained_variance_ratio: Vec<f64> = eigenvalues
            .iter()
            .map(|&ev| {
                if total_variance > 0.0 {
                    ev / total_variance
                } else {
                    0.0
                }
            })
            .collect();

        PcaResult {
            components,
            explained_variance: eigenvalues,
            explained_variance_ratio,
            mean,
            std_dev,
            n_samples: n,
            n_features: d,
            // Iteration count of the LAST extracted component only.
            iterations_used: last_iters,
        }
    }
}
326
327/// Compute total variance from centered data matrix: sum of column variances.
328fn compute_total_variance(mat: &Array2<f64>, denom: f64) -> f64 {
329    let d = mat.ncols();
330    let mut total = 0.0;
331    for j in 0..d {
332        let col = mat.column(j);
333        let ss: f64 = col.iter().map(|x| x * x).sum();
334        total += ss / denom;
335    }
336    total
337}
338
339// ============================================================
340// Randomized SVD (Halko-Martinsson-Tropp)
341// ============================================================
342
343/// Randomized SVD: find top-k singular vectors of a centered data matrix.
344///
345/// Algorithm (Algorithm 5.1 from Halko et al. 2011):
346/// 1. Generate random Gaussian matrix Omega (d × (k+p))
347/// 2. Y = X @ Omega
348/// 3. Power iteration for stability: Y = (X @ X^T)^q @ Y, with QR between steps
349/// 4. QR factorization of Y → Q
350/// 5. B = Q^T @ X (small: (k+p) × d)
351/// 6. SVD of B via eigendecomposition of B @ B^T
352/// 7. Return top-k right singular vectors and singular values
353fn randomized_svd(
354    mat: &Array2<f64>,    // n × d, already centered
355    k: usize,             // number of components
356    n_oversamples: usize, // oversampling parameter
357    n_power_iters: usize, // power iterations for accuracy
358) -> (Vec<Vec<f64>>, Vec<f64>) {
359    let n = mat.nrows();
360    let d = mat.ncols();
361    let l = (k + n_oversamples).min(n).min(d); // sketch width
362
363    // Stage 1: Random projection
364    let mut rng = rand::thread_rng();
365    let omega_flat: Vec<f64> = (0..d * l)
366        .map(|_| rng.sample::<f64, _>(rand::distributions::Standard))
367        .collect();
368    let omega = Array2::from_shape_vec((d, l), omega_flat).unwrap();
369
370    // Y = X @ Omega  (n × l)
371    let mut y = mat.dot(&omega);
372
373    // Power iteration for stability: repeat q times
374    // Y = X @ (X^T @ Y), then QR factorize Y
375    for _ in 0..n_power_iters {
376        // QR factorize Y to maintain numerical stability
377        qr_modified_gram_schmidt(&mut y);
378        // Y = X @ (X^T @ Y)
379        let xty = mat.t().dot(&y); // d × l
380        y = mat.dot(&xty); // n × l
381    }
382
383    // Final QR to get orthonormal basis Q
384    qr_modified_gram_schmidt(&mut y);
385    let q = y; // n × l, orthonormal columns
386
387    // Stage 2: Form small matrix B = Q^T @ X  (l × d)
388    let b = q.t().dot(mat);
389
390    // SVD of B via eigendecomposition of B @ B^T  (l × l, small)
391    let bbt = b.dot(&b.t());
392    let l_actual = bbt.nrows();
393
394    // Eigen-decompose the small l×l matrix using power iteration
395    // (we need all l eigenvectors, but we only keep top k)
396    let (eigvecs_left, eigvals) = symmetric_eigen(&bbt, l_actual);
397
398    // Right singular vectors of B: V_i = B^T @ U_i / σ_i
399    // Components are the right singular vectors of X (rows of V^T)
400    let mut components = Vec::with_capacity(k);
401    let mut singular_values = Vec::with_capacity(k);
402
403    #[allow(clippy::needless_range_loop)]
404    for i in 0..k.min(l_actual) {
405        let sigma_sq = eigvals[i];
406        if sigma_sq < 1e-30 {
407            break;
408        }
409        let sigma = sigma_sq.sqrt();
410        singular_values.push(sigma);
411
412        // u_i = eigvecs_left column i
413        let u_i = eigvecs_left.column(i);
414        // v_i = B^T @ u_i / sigma
415        let v_i = b.t().dot(&u_i) / sigma;
416        components.push(v_i.to_vec());
417    }
418
419    (components, singular_values)
420}
421
422/// Symmetric eigendecomposition of a small matrix using power iteration + deflation.
423///
424/// Returns (eigenvectors as columns of Array2, eigenvalues sorted descending).
425fn symmetric_eigen(mat: &Array2<f64>, k: usize) -> (Array2<f64>, Vec<f64>) {
426    let d = mat.nrows();
427    let mut deflated = mat.clone();
428    let mut eigvecs = Array2::<f64>::zeros((d, k));
429    let mut eigvals = Vec::with_capacity(k);
430
431    for i in 0..k {
432        let (vec, _val, _) = power_iteration(&deflated, 200, 1e-12);
433
434        // Gram-Schmidt against previous eigenvectors
435        let mut v = Array1::from(vec);
436        for j in 0..i {
437            let prev = eigvecs.column(j);
438            let dot = prev.dot(&v);
439            v -= &(&prev * dot);
440        }
441        let norm = v.dot(&v).sqrt();
442        if norm > 1e-15 {
443            v /= norm;
444        }
445
446        // Recompute eigenvalue after orthogonalization
447        let av = deflated.dot(&v);
448        let val = v.dot(&av);
449
450        // Deflate
451        for r in 0..d {
452            for c in 0..d {
453                deflated[[r, c]] -= val * v[r] * v[c];
454            }
455        }
456
457        eigvecs.column_mut(i).assign(&v);
458        eigvals.push(val);
459    }
460
461    (eigvecs, eigvals)
462}
463
464// ============================================================
465// QR factorization (Modified Gram-Schmidt)
466// ============================================================
467
468/// Modified Gram-Schmidt QR factorization in-place.
469///
470/// Replaces columns of `mat` with orthonormal vectors spanning the same space.
471fn qr_modified_gram_schmidt(mat: &mut Array2<f64>) {
472    let ncols = mat.ncols();
473    for j in 0..ncols {
474        let mut col_j = mat.column(j).to_owned();
475        for i in 0..j {
476            let col_i = mat.column(i).to_owned();
477            let r = col_i.dot(&col_j);
478            col_j -= &(&col_i * r);
479        }
480        let norm = col_j.dot(&col_j).sqrt();
481        if norm > 1e-15 {
482            col_j /= norm;
483        }
484        mat.column_mut(j).assign(&col_j);
485    }
486}
487
488// ============================================================
489// Power iteration (used by both solvers and internal eigen)
490// ============================================================
491
492/// Power iteration: find the dominant eigenvector of a symmetric matrix.
493///
494/// Returns (eigenvector, eigenvalue, iterations_used).
495fn power_iteration(
496    matrix: &Array2<f64>,
497    max_iters: usize,
498    tolerance: f64,
499) -> (Vec<f64>, f64, usize) {
500    let d = matrix.nrows();
501
502    // Initialize with a vector that has some structure to avoid degenerate starts
503    let mut v = Array1::<f64>::zeros(d);
504    for i in 0..d {
505        v[i] = ((i + 1) as f64).sqrt();
506    }
507    let norm = v.dot(&v).sqrt();
508    if norm > 0.0 {
509        v /= norm;
510    }
511
512    let mut iters = 0;
513    for iter in 0..max_iters {
514        iters = iter + 1;
515
516        // w = C * v
517        let w = matrix.dot(&v);
518
519        // Normalize
520        let w_norm = w.dot(&w).sqrt();
521        if w_norm < 1e-15 {
522            // Matrix has zero eigenvalue in this direction
523            break;
524        }
525        let v_new = &w / w_norm;
526
527        // Check convergence: |v_new - v| or |v_new + v| (sign may flip)
528        let diff_pos: f64 = v_new
529            .iter()
530            .zip(v.iter())
531            .map(|(a, b)| (a - b).powi(2))
532            .sum();
533        let diff_neg: f64 = v_new
534            .iter()
535            .zip(v.iter())
536            .map(|(a, b)| (a + b).powi(2))
537            .sum();
538        let diff = diff_pos.min(diff_neg).sqrt();
539
540        v = v_new;
541
542        if diff < tolerance {
543            break;
544        }
545    }
546
547    // Eigenvalue: v^T C v
548    let cv = matrix.dot(&v);
549    let eigenvalue = v.dot(&cv);
550
551    (v.to_vec(), eigenvalue, iters)
552}
553
#[cfg(test)]
mod tests {
    use super::*;

    // Coverage: default/Auto behavior, the legacy PowerIteration solver,
    // the Randomized SVD solver, preprocessing (centering/scaling), and
    // agreement between batch and single-point projection.

    #[test]
    fn test_pca_basic() {
        // 2D data with clear primary direction along x=y
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                let y = x + (i as f64 * 0.1).sin() * 2.0; // strong correlation
                vec![x, y]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        assert_eq!(result.explained_variance.len(), 2);

        // First component should capture most variance
        assert!(
            result.explained_variance_ratio[0] > 0.9,
            "First component should explain >90% variance, got {}",
            result.explained_variance_ratio[0]
        );

        // First component should be roughly along [1, 1] / sqrt(2) direction
        let c0 = &result.components[0];
        let angle = (c0[0].abs() - c0[1].abs()).abs();
        assert!(
            angle < 0.2,
            "First component should be near diagonal, got {:?}",
            c0
        );
    }

    #[test]
    fn test_pca_identity() {
        // When n_components == n_features, explained_variance_ratio sums to ~1.0
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| {
                vec![
                    i as f64,
                    (i as f64 * 0.5).sin() * 10.0,
                    (i * i) as f64 % 17.0,
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 3,
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.01,
            "Total explained variance ratio should be ~1.0, got {}",
            total
        );
    }

    #[test]
    fn test_pca_explained_variance_ratios_sum() {
        let data: Vec<Vec<f64>> = (0..200)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![
                    x,
                    x * 2.0 + 1.0,
                    x.sin(),
                    x.cos(),
                    (x * 0.3).exp().min(100.0),
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 5,
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.05,
            "Ratios should sum to ~1.0, got {}",
            total
        );

        // Ratios should be in descending order
        for i in 1..result.explained_variance_ratio.len() {
            assert!(
                result.explained_variance_ratio[i]
                    <= result.explained_variance_ratio[i - 1] + 1e-10,
                "Ratios should be descending"
            );
        }
    }

    #[test]
    fn test_pca_transform() {
        // Create correlated 3D data
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 1,
                ..Default::default()
            },
        );

        // Project and check that reconstructing from 1 component preserves most info
        let projected = result.transform(&data);
        assert_eq!(projected.len(), 100);
        assert_eq!(projected[0].len(), 1);

        // First component should explain nearly all variance (perfectly correlated data)
        assert!(
            result.explained_variance_ratio[0] > 0.99,
            "Should explain >99% variance for perfectly correlated data, got {}",
            result.explained_variance_ratio[0]
        );
    }

    #[test]
    fn test_pca_centering() {
        // Data with large offset should give same components as centered data
        let offset = 1000.0;
        let data: Vec<Vec<f64>> = (0..50)
            .map(|i| vec![i as f64 + offset, (i as f64 * 2.0) + offset])
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                center: true,
                ..Default::default()
            },
        );

        // Mean should be approximately offset + 24.5 for x, offset + 49 for y
        assert!((result.mean[0] - (offset + 24.5)).abs() < 0.01);
        assert!((result.mean[1] - (offset + 49.0)).abs() < 0.01);

        // Components should still capture the correlation direction
        assert!(result.explained_variance_ratio[0] > 0.99);
    }

    #[test]
    fn test_pca_convergence() {
        // Simple data that should converge quickly
        let data: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64, 0.0]).collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 1,
                max_iterations: 100,
                tolerance: 1e-10,
                solver: PcaSolver::PowerIteration,
                ..Default::default()
            },
        );

        // Should converge well before 100 iterations for such simple data
        assert!(
            result.iterations_used < 50,
            "Should converge quickly, used {} iterations",
            result.iterations_used
        );

        // First component should be [1, 0] (all variance along x)
        let c0 = &result.components[0];
        assert!(c0[0].abs() > 0.99, "Should be along x axis, got {:?}", c0);
        assert!(
            c0[1].abs() < 0.1,
            "Should have near-zero y component, got {:?}",
            c0
        );
    }

    #[test]
    fn test_pca_scaling() {
        // Two features with very different scales
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| vec![i as f64, i as f64 * 1000.0])
            .collect();

        // Without scaling, second feature dominates
        let result_no_scale = pca(
            &data,
            PcaConfig {
                n_components: 2,
                scale: false,
                ..Default::default()
            },
        );
        // First component should align with second feature (larger variance)
        assert!(result_no_scale.components[0][1].abs() > result_no_scale.components[0][0].abs());

        // With scaling, features are treated equally
        let result_scaled = pca(
            &data,
            PcaConfig {
                n_components: 2,
                scale: true,
                ..Default::default()
            },
        );
        // Both features should contribute roughly equally to first component
        let ratio = result_scaled.components[0][0].abs() / result_scaled.components[0][1].abs();
        assert!(
            ratio > 0.5 && ratio < 2.0,
            "Scaled components should be balanced, ratio = {}",
            ratio
        );
    }

    // ── New tests for randomized SVD ──

    #[test]
    fn test_pca_randomized_basic() {
        // Generate data large enough to trigger randomized solver via Auto
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 10.0, (x * 0.01).cos() * 5.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        // First component should capture most variance (x and 2x dominate)
        assert!(
            result.explained_variance_ratio[0] > 0.8,
            "Randomized SVD: first component should explain >80% variance, got {}",
            result.explained_variance_ratio[0]
        );
    }

    #[test]
    fn test_pca_randomized_orthogonality() {
        // Verify components from randomized SVD are orthogonal
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![
                    x,
                    x * 2.0 + 1.0,
                    x.sin() * 10.0,
                    x.cos() * 5.0,
                    (x * 0.3).exp().min(100.0),
                ]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 4,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        // Check pairwise orthogonality
        for i in 0..result.components.len() {
            for j in (i + 1)..result.components.len() {
                let dot: f64 = result.components[i]
                    .iter()
                    .zip(result.components[j].iter())
                    .map(|(a, b)| a * b)
                    .sum();
                assert!(
                    dot.abs() < 0.05,
                    "Components {} and {} should be orthogonal, dot product = {}",
                    i,
                    j,
                    dot
                );
            }
        }
    }

    #[test]
    fn test_pca_auto_selects_randomized() {
        // n=600, d=4, k=2 → k < 0.8 * min(600,4) = 3.2 → randomized
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0, x.sin(), x.cos()]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::Auto,
                ..Default::default()
            },
        );

        // iterations_used should be the n_power_iters (4) for randomized, not >4
        // (Power iteration would use many more iterations)
        assert!(
            result.iterations_used <= 10,
            "Auto should select randomized (iters={})",
            result.iterations_used
        );
        assert_eq!(result.components.len(), 2);
    }

    #[test]
    fn test_pca_solver_backward_compat() {
        // Verify PowerIteration still works identically to before
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 1.5 + 3.0, x * 0.8 - 2.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                solver: PcaSolver::PowerIteration,
                ..Default::default()
            },
        );

        assert_eq!(result.components.len(), 2);
        assert!(
            result.explained_variance_ratio[0] > 0.99,
            "PowerIteration should explain >99% on correlated data, got {}",
            result.explained_variance_ratio[0]
        );

        // Verify orthogonality of components (improved by Gram-Schmidt)
        let dot: f64 = result.components[0]
            .iter()
            .zip(result.components[1].iter())
            .map(|(a, b)| a * b)
            .sum();
        assert!(
            dot.abs() < 0.01,
            "PowerIteration components should be orthogonal, dot = {}",
            dot
        );
    }

    #[test]
    fn test_pca_randomized_variance_sum() {
        // Variance ratios from randomized SVD should sum close to 1.0 when k = d
        let data: Vec<Vec<f64>> = (0..600)
            .map(|i| {
                let x = i as f64 * 0.1;
                vec![x, x.sin() * 10.0, (x * 0.5).cos() * 5.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 3,
                solver: PcaSolver::Randomized {
                    n_oversamples: 10,
                    n_power_iters: 4,
                },
                ..Default::default()
            },
        );

        let total: f64 = result.explained_variance_ratio.iter().sum();
        assert!(
            (total - 1.0).abs() < 0.1,
            "Randomized SVD ratios should sum close to 1.0, got {}",
            total
        );
    }

    #[test]
    fn test_pca_transform_batch() {
        // Verify that batch transform matches individual transform_one calls
        let data: Vec<Vec<f64>> = (0..100)
            .map(|i| {
                let x = i as f64;
                vec![x, x * 2.0 + 1.0, x.sin() * 3.0]
            })
            .collect();

        let result = pca(
            &data,
            PcaConfig {
                n_components: 2,
                ..Default::default()
            },
        );

        let batch = result.transform(&data);
        for (i, row) in data.iter().enumerate() {
            let single = result.transform_one(row);
            for (j, (&b, &s)) in batch[i].iter().zip(single.iter()).enumerate() {
                assert!(
                    (b - s).abs() < 1e-10,
                    "Batch[{}][{}]={} != single={}",
                    i,
                    j,
                    b,
                    s
                );
            }
        }
    }
}