1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
///! Principal Component Analysis
///
/// Reduce dimensionality with a linear projection using Singular Value Decomposition. The data is
/// centered before applying the SVD. This uses TruncatedSvd from ndarray-linalg package.
use ndarray::{Array1, Array2, ArrayBase, Axis, Data, Ix2};
use ndarray_linalg::{TruncatedOrder, TruncatedSvd};

use linfa::{
    traits::{Fit, Predict},
    Dataset, Float,
};

/// Pincipal Component Analysis
pub struct PrincipalComponentAnalysisParams {
    embedding_size: usize,
}

impl<'a> Fit<'a, Array2<f64>, ()> for PrincipalComponentAnalysisParams {
    type Object = Pca<f64>;

    fn fit(&self, dataset: &Dataset<Array2<f64>, ()>) -> Pca<f64> {
        let mut x = dataset.records().to_owned();
        // calculate mean of data and subtract it
        let mean = x.mean_axis(Axis(0)).unwrap();
        x -= &mean;

        // estimate Singular Value Decomposition
        let result = TruncatedSvd::new(x, TruncatedOrder::Largest)
            .decompose(self.embedding_size)
            .unwrap();

        // explained variance is the spectral distribution of the eigenvalues
        let (_, sigma, v_t) = result.values_vectors();
        let explained_variance = sigma.mapv(|x| x * x / (sigma.len() as f64 - 1.0));

        Pca {
            embedding: v_t,
            explained_variance,
            mean,
        }
    }
}

pub struct Pca<F> {
    embedding: Array2<F>,
    explained_variance: Array1<F>,
    mean: Array1<F>,
}

impl Pca<f64> {
    pub fn params(size: usize) -> PrincipalComponentAnalysisParams {
        PrincipalComponentAnalysisParams {
            embedding_size: size,
        }
    }

    /// Return the amount of explained variance per element
    pub fn explained_variance(&self) -> Array1<f64> {
        self.explained_variance.clone()
    }

    /// Return the normalized amount of explained variance per element
    pub fn explained_variance_ratio(&self) -> Array1<f64> {
        &self.explained_variance / self.explained_variance.sum()
    }
}

impl<F: Float, D: Data<Elem = F>> Predict<ArrayBase<D, Ix2>, Array2<F>> for Pca<F> {
    /// Given a new data points project with fitted model
    fn predict(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
        (&x - &self.mean).dot(&self.embedding.t())
    }
}