use crate::error::{ReductionError, Result};
#[cfg(not(feature = "blas"))]
use linfa_linalg::{lobpcg::TruncatedSvd, Order};
use ndarray::{Array1, Array2, ArrayBase, Axis, Data, Ix2};
#[cfg(feature = "blas")]
use ndarray_linalg::{TruncatedOrder, TruncatedSvd};
use rand::{prelude::SmallRng, SeedableRng};
#[cfg(feature = "serde")]
use serde_crate::{Deserialize, Serialize};
use linfa::{
dataset::Records,
traits::{Fit, PredictInplace, Transformer},
DatasetBase, Float,
};
/// Hyperparameters for fitting a [`Pca`] model.
///
/// Construct via [`Pca::params`], then optionally toggle whitening with
/// [`PcaParams::whiten`] before calling `fit`.
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PcaParams {
    // Number of principal components to keep (target dimensionality).
    embedding_size: usize,
    // When true, components are rescaled during `fit` so that the
    // projected data has identity covariance.
    apply_whitening: bool,
}
impl PcaParams {
    /// Enable or disable whitening of the projected data.
    ///
    /// Whitening rescales each principal axis so that the transformed
    /// data ends up with unit (identity) covariance.
    pub fn whiten(self, apply: bool) -> Self {
        // Struct-update syntax: replace the flag, keep everything else.
        Self {
            apply_whitening: apply,
            ..self
        }
    }
}
impl<T, D: Data<Elem = f64>> Fit<ArrayBase<D, Ix2>, T, ReductionError> for PcaParams {
    type Object = Pca<f64>;

    /// Fit a PCA model to the dataset's records via a truncated SVD of the
    /// mean-centered data.
    ///
    /// # Errors
    ///
    /// * `NotEnoughSamples` when the dataset contains no samples.
    /// * `EmbeddingTooSmall` when `embedding_size` is zero or exceeds the
    ///   number of features.
    fn fit(&self, dataset: &DatasetBase<ArrayBase<D, Ix2>, T>) -> Result<Pca<f64>> {
        if dataset.nsamples() == 0 {
            return Err(ReductionError::NotEnoughSamples);
        } else if dataset.nfeatures() < self.embedding_size || self.embedding_size == 0 {
            return Err(ReductionError::EmbeddingTooSmall(self.embedding_size));
        }
        let x = dataset.records();
        // nsamples > 0 was checked above, so the per-feature mean exists.
        let mean = x.mean_axis(Axis(0)).unwrap();
        // Center the data so the SVD factors the covariance structure.
        let x = x - &mean;
        // Truncated SVD of the centered data; the backend depends on the
        // `blas` feature. The pure-Rust path seeds its RNG with a fixed
        // value so fits are deterministic.
        #[cfg(feature = "blas")]
        let result =
            TruncatedSvd::new(x, TruncatedOrder::Largest).decompose(self.embedding_size)?;
        #[cfg(not(feature = "blas"))]
        let result = TruncatedSvd::new_with_rng(x, Order::Largest, SmallRng::seed_from_u64(42))
            .decompose(self.embedding_size)?;
        let (_, sigma, mut v_t) = result.values_vectors();
        // Clamp tiny singular values so the whitening division below
        // cannot blow up on (near-)rank-deficient data.
        let sigma = sigma.mapv(|x| x.max(1e-8));
        if self.apply_whitening {
            // Scale each principal axis by sqrt(n - 1) / sigma_i so the
            // projected data has identity covariance.
            let cov_scale = (dataset.nsamples() as f64 - 1.).sqrt();
            for (mut v_t, sigma) in v_t.axis_iter_mut(Axis(0)).zip(sigma.iter()) {
                v_t *= cov_scale / *sigma;
            }
        }
        Ok(Pca {
            embedding: v_t,
            sigma,
            mean,
        })
    }
}
/// A fitted PCA model: the learned projection plus the statistics needed
/// to apply (and invert) it.
#[cfg_attr(
    feature = "serde",
    derive(Serialize, Deserialize),
    serde(crate = "serde_crate")
)]
#[derive(Debug, Clone, PartialEq)]
pub struct Pca<F> {
    // Principal axes, one component per row (possibly whitened at fit time).
    embedding: Array2<F>,
    // Singular values of the centered data, one per kept component.
    sigma: Array1<F>,
    // Per-feature mean subtracted before projecting.
    mean: Array1<F>,
}
impl Pca<f64> {
    /// Create a default parameter set for a PCA with the given target
    /// dimensionality (whitening disabled).
    pub fn params(embedding_size: usize) -> PcaParams {
        PcaParams {
            embedding_size,
            apply_whitening: false,
        }
    }

    /// Variance explained by each kept principal component.
    ///
    /// NOTE(review): the divisor is `sigma.len() - 1` (number of kept
    /// components), not `n_samples - 1` as in the usual sample-variance
    /// formula — confirm this scaling is intended. The ratio below is
    /// unaffected either way, because the constant divisor cancels.
    pub fn explained_variance(&self) -> Array1<f64> {
        self.sigma.mapv(|x| x * x / (self.sigma.len() as f64 - 1.0))
    }

    /// Fraction of the total explained variance attributable to each
    /// component; the entries sum to one.
    pub fn explained_variance_ratio(&self) -> Array1<f64> {
        // Reuse explained_variance instead of duplicating the formula.
        let ex_var = self.explained_variance();
        let sum_ex_var = ex_var.sum();
        ex_var / sum_ex_var
    }

    /// Principal axes in feature space, one component per row.
    pub fn components(&self) -> &Array2<f64> {
        &self.embedding
    }

    /// Per-feature empirical mean subtracted before projection.
    pub fn mean(&self) -> &Array1<f64> {
        &self.mean
    }

    /// Singular values associated with each kept component.
    pub fn singular_values(&self) -> &Array1<f64> {
        &self.sigma
    }

    /// Map points from the embedded space back into the original feature
    /// space. (`Array2<f64>` is the same type as the previously spelled-out
    /// `ArrayBase<OwnedRepr<f64>, Dim<[usize; 2]>>`, so callers are
    /// unaffected.)
    pub fn inverse_transform(&self, prediction: Array2<f64>) -> Array2<f64> {
        prediction.dot(&self.embedding) + &self.mean
    }
}
impl<F: Float, D: Data<Elem = F>> PredictInplace<ArrayBase<D, Ix2>, Array2<F>> for Pca<F> {
    /// Project `records` onto the principal components, writing the result
    /// into `targets`.
    ///
    /// # Panics
    ///
    /// Panics when `targets` does not have shape
    /// `(n_records, n_components)`.
    fn predict_inplace(&self, records: &ArrayBase<D, Ix2>, targets: &mut Array2<F>) {
        let expected_shape = [records.nrows(), self.embedding.nrows()];
        assert_eq!(
            targets.shape(),
            &expected_shape,
            "The number of data points must match the number of output targets."
        );
        // Center the records, then project onto the component axes.
        let centered = records - &self.mean;
        *targets = centered.dot(&self.embedding.t());
    }

    /// Allocate a zeroed output buffer with the projected shape.
    fn default_target(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
        Array2::zeros((x.nrows(), self.embedding.nrows()))
    }
}
impl<F: Float, D: Data<Elem = F>, T>
    Transformer<DatasetBase<ArrayBase<D, Ix2>, T>, DatasetBase<Array2<F>, T>> for Pca<F>
{
    /// Consume a dataset and return a new one whose records are the PCA
    /// projection; targets and weights are carried over unchanged.
    fn transform(&self, ds: DatasetBase<ArrayBase<D, Ix2>, T>) -> DatasetBase<Array2<F>, T> {
        // Split the dataset so the records can be projected while the
        // targets and weights are moved into the output untouched.
        let DatasetBase {
            records,
            targets,
            weights,
            ..
        } = ds;
        let mut projected = self.default_target(&records);
        self.predict_inplace(&records, &mut projected);
        DatasetBase::new(projected, targets).with_weights(weights)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
    use approx::assert_abs_diff_eq;
    use linfa::{traits::Predict, Dataset};
    use ndarray::{array, Array2};
    use ndarray_rand::{
        rand_distr::{StandardNormal, Uniform},
        RandomExt,
    };
    use rand::{rngs::SmallRng, SeedableRng};

    // Public types must keep their auto traits (Send/Sync/Unpin) so they
    // can be shared across threads; this fails to compile if one is lost.
    #[test]
    fn autotraits() {
        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
        has_autotraits::<DiffusionMap<f64>>();
        has_autotraits::<DiffusionMapValidParams>();
        has_autotraits::<DiffusionMapParams>();
        has_autotraits::<ReductionError>();
        has_autotraits::<PcaParams>();
        has_autotraits::<Pca<f64>>();
    }

    // Whitened projection of 2-D correlated data (uniform noise sheared by
    // q) should have identity sample covariance.
    #[test]
    fn test_whitening_small() {
        let mut rng = SmallRng::seed_from_u64(42);
        let tmp = Array2::random_using((300, 2), Uniform::new(-1.0f64, 1.), &mut rng);
        // Shear matrix introduces correlation between the two features.
        let q = array![[1., 1.], [-1., 1.]];
        let dataset = Dataset::from(tmp.dot(&q));
        let model = Pca::params(2).whiten(true).fit(&dataset).unwrap();
        let proj = model.predict(&dataset);
        // Sample covariance of the whitened projection ~ identity.
        let cov = proj.t().dot(&proj);
        assert_abs_diff_eq!(cov / (300. - 1.), Array2::eye(2), epsilon = 1e-5);
    }

    // Same identity-covariance property, but on a 50-feature random matrix
    // reduced to 10 components.
    #[test]
    fn test_whitening_rand() {
        let mut rng = SmallRng::seed_from_u64(42);
        let data = Array2::random_using((300, 50), Uniform::new(-1.0f64, 1.), &mut rng);
        let dataset = Dataset::from(data);
        let model = Pca::params(10).whiten(true).fit(&dataset).unwrap();
        let proj = model.predict(&dataset);
        let cov = proj.t().dot(&proj);
        assert_abs_diff_eq!(cov / (300. - 1.), Array2::eye(10), epsilon = 1e-5);
    }

    // The squared singular values of a normalized Gaussian matrix should
    // follow the Marchenko-Pastur distribution with ratio p/n = 0.5:
    // support edges at (1 -/+ sqrt(0.5))^2 and density
    // sqrt((b - x)(x - a)) / (pi * x) on [a, b].
    #[test]
    fn test_marchenko_pastur() {
        let mut rng = SmallRng::seed_from_u64(3);
        let data = Array2::random_using((1000, 500), StandardNormal, &mut rng);
        // Normalize by sqrt(n) so the spectrum converges to the MP law.
        let dataset = Dataset::from(data / 1000f64.sqrt());
        let model = Pca::params(500).fit(&dataset).unwrap();
        let sv = model.singular_values().mapv(|x| x * x);
        // Theoretical support edges of the MP distribution.
        let (a, b) = (
            1. * (1. - 0.5f64.sqrt()).powf(2.0),
            1. * (1. + 0.5f64.sqrt()).powf(2.0),
        );
        // Largest/smallest eigenvalues should sit near the support edges.
        assert_abs_diff_eq!(b, sv[0], epsilon = 0.1);
        assert_abs_diff_eq!(a, sv[sv.len() - 1], epsilon = 0.1);
        // Histogram the spectrum (sv is sorted descending) and compare each
        // bin's empirical density against the MP density at the bin center.
        let mut i = 0;
        'outer: for th in Array1::linspace(0.1, 2.8, 28).iter().rev() {
            let mut count = 0;
            while sv[i] >= *th {
                count += 1;
                i += 1;
                if i == sv.len() {
                    break 'outer;
                }
            }
            // Evaluate the density at the bin midpoint (bin width 0.1).
            let x = th + 0.05;
            let mp_law = ((b - x) * (x - a)).sqrt() / std::f64::consts::PI / x;
            let empirical = count as f64 / 500. / ((2.8 - 0.1) / 28.);
            assert_abs_diff_eq!(mp_law, empirical, epsilon = 0.06);
        }
    }

    // Rank-2 data built from two orthonormal vectors: the fit should keep
    // only the two informative components (note the returned length is 2
    // even though 10 were requested), each explaining half the variance.
    #[test]
    fn test_explained_variance_cutoff() {
        let mut rng = SmallRng::seed_from_u64(42);
        let n = 500;
        // a: random unit vector.
        let mut a = Array1::<f64>::random_using(n, StandardNormal, &mut rng);
        a /= (a.t().dot(&a)).sqrt();
        // b: random unit vector orthogonalized against a (Gram-Schmidt).
        let mut b = Array1::random_using(n, StandardNormal, &mut rng);
        b -= &(b.t().dot(&a) * &a);
        b /= (b.t().dot(&b)).sqrt();
        // data = a a^T + b b^T has exactly rank 2.
        let data =
            Array2::from_shape_fn((500, 500), |dim| a[dim.0] * a[dim.1] + b[dim.0] * b[dim.1]);
        let dataset = Dataset::from(data);
        let model = Pca::params(10).fit(&dataset).unwrap();
        assert_eq!(model.explained_variance_ratio().len(), 2);
        assert_abs_diff_eq!(
            model.explained_variance_ratio(),
            array![1. / 2., 1. / 2.],
            epsilon = 1e-2
        );
    }

    // On the 4x4 identity-diagonal dataset, the three kept components each
    // explain an equal third of the variance.
    #[test]
    fn test_explained_variance_diag() {
        let dataset = Dataset::from(Array2::from_diag(&array![1., 1., 1., 1.]));
        let model = Pca::params(3).fit(&dataset).unwrap();
        assert_abs_diff_eq!(
            model.explained_variance_ratio(),
            array![1. / 3., 1. / 3., 1. / 3.],
            epsilon = 1e-6
        );
    }
}