use std::fmt;
use ndarray::{Array1, ArrayBase, ArrayView2, Axis, Data, Ix2};
use rand::{rngs::SmallRng, seq::SliceRandom, SeedableRng};
use crate::dataset::DatasetBase;
use crate::Float;
fn pearson_correlation<F: Float, D: Data<Elem = F>>(data: &ArrayBase<D, Ix2>) -> Array1<F> {
let nobservations = data.nrows();
let nfeatures = data.ncols();
let mean = data.mean_axis(Axis(0)).unwrap();
let denoised = data - &mean.insert_axis(Axis(1)).t();
let covariance = denoised.t().dot(&denoised) / F::cast(nobservations - 1);
let std_deviation = denoised.var_axis(Axis(0), F::one()).mapv(|x| x.sqrt());
let mut pearson_coeffs = Array1::zeros(nfeatures * (nfeatures - 1) / 2);
let mut k = 0;
for i in 0..(nfeatures - 1) {
for j in (i + 1)..nfeatures {
pearson_coeffs[k] = covariance[(i, j)] / std_deviation[i] / std_deviation[j];
k += 1;
}
}
pearson_coeffs
}
fn p_values<F: Float, D: Data<Elem = F>>(
data: &ArrayBase<D, Ix2>,
ground: &Array1<F>,
num_iter: usize,
) -> Array1<F> {
let (n, m) = (data.ncols(), data.nrows());
let mut flattened = Vec::with_capacity(n * m);
for i in 0..m {
for j in 0..n {
flattened.push(data[(i, j)]);
}
}
let mut p_values = Array1::zeros(n * (n - 1) / 2);
let mut rng = SmallRng::from_entropy();
for _ in 0..num_iter {
for j in 0..n {
flattened[j * m..(j + 1) * m].shuffle(&mut rng);
}
let arr_view = ArrayView2::from_shape((m, n), &flattened).unwrap();
let correlation = pearson_correlation(&arr_view.t());
let greater = ground
.iter()
.zip(correlation.iter())
.map(|(a, b)| {
if a.abs() < b.abs() {
F::one()
} else {
F::zero()
}
})
.collect::<Array1<_>>();
p_values += &greater;
}
p_values / F::cast(num_iter)
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PearsonCorrelation<F> {
pearson_coeffs: Array1<F>,
p_values: Array1<F>,
feature_names: Vec<String>,
}
impl<F: Float> PearsonCorrelation<F> {
pub fn from_dataset<D: Data<Elem = F>, T>(
dataset: &DatasetBase<ArrayBase<D, Ix2>, T>,
num_iter: Option<usize>,
) -> Self {
let pearson_coeffs = pearson_correlation(dataset.records());
let p_values = match num_iter {
Some(num_iter) => p_values(dataset.records(), &pearson_coeffs, num_iter),
None => Array1::zeros(0),
};
PearsonCorrelation {
pearson_coeffs,
p_values,
feature_names: dataset.feature_names(),
}
}
pub fn get_coeffs(&self) -> &Array1<F> {
&self.pearson_coeffs
}
pub fn get_p_values(&self) -> Option<&Array1<F>> {
if self.p_values.is_empty() {
None
} else {
Some(&self.p_values)
}
}
}
impl<F: Float, D: Data<Elem = F>, T> DatasetBase<ArrayBase<D, Ix2>, T> {
pub fn pearson_correlation(&self) -> PearsonCorrelation<F> {
PearsonCorrelation::from_dataset(self, None)
}
pub fn pearson_correlation_with_p_value(&self, num_iter: usize) -> PearsonCorrelation<F> {
PearsonCorrelation::from_dataset(self, Some(num_iter))
}
}
impl<F: Float> fmt::Display for PearsonCorrelation<F> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let n = self.feature_names.len();
let longest = self.feature_names.iter().map(|x| x.len()).max().unwrap();
let mut k = 0;
for i in 0..(n - 1) {
write!(f, "{}", self.feature_names[i])?;
for _ in 0..longest - self.feature_names[i].len() {
write!(f, " ")?;
}
for _ in 0..i {
write!(f, " ")?;
}
for _ in (i + 1)..n {
if !self.p_values.is_empty() {
write!(
f,
"{:+.2} ({:.2}) ",
self.pearson_coeffs[k], self.p_values[k]
)?;
} else {
write!(f, "{:.2} ", self.pearson_coeffs[k])?;
}
k += 1;
}
writeln!(f,)?;
}
writeln!(f, "{}", self.feature_names[n - 1])?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use crate::DatasetBase;
use ndarray::{concatenate, Array, Axis};
use ndarray_rand::{rand_distr::Uniform, RandomExt};
use rand::{rngs::SmallRng, SeedableRng};
#[test]
fn uniform_random() {
let mut rng = SmallRng::seed_from_u64(42);
let data = Array::random_using((1000, 4), Uniform::new(-1., 1.), &mut rng);
let pcc = DatasetBase::from(data).pearson_correlation();
assert!(pcc.get_coeffs().mapv(|x: f32| x.abs()).sum() < 5e-2 * 6.0);
}
#[test]
fn perfectly_correlated() {
let mut rng = SmallRng::seed_from_u64(42);
let v = Array::random_using((4, 1), Uniform::new(0., 1.), &mut rng);
let data = Array::random_using((1000, 1), Uniform::new(-1., 1.), &mut rng);
let data_proj = data.dot(&v.t());
let corr = DatasetBase::from(concatenate![Axis(1), data, data_proj])
.pearson_correlation_with_p_value(100);
assert!(corr.get_coeffs().mapv(|x| 1. - x).sum() < 1e-2);
assert!(corr.get_p_values().unwrap().sum() < 1e-2);
}
}