linfa 0.7.0

A Machine Learning framework for Rust
Documentation
//! Correlation analysis for dataset features
//!
//! # Implementations
//!
//! * Pearsons's Correlation Coefficients - linear feature correlation
use std::fmt;

use ndarray::{Array1, ArrayBase, ArrayView2, Axis, Data, Ix2};
use rand::{rngs::SmallRng, seq::SliceRandom, SeedableRng};

use crate::dataset::DatasetBase;
use crate::Float;

/// Calculate the Pearson's Correlation Coefficient (or bivariate correlation)
///
/// The PCC describes the linear correlation between two variables. It is the covariance divided by
/// the product of the standard deviations, therefore essentially a normalised measurement of the
/// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
/// between both variables.
fn pearson_correlation<F: Float, D: Data<Elem = F>>(data: &ArrayBase<D, Ix2>) -> Array1<F> {
    // number of obserations and features
    let nobservations = data.nrows();
    let nfeatures = data.ncols();

    // center distribution by subtracting mean
    let mean = data.mean_axis(Axis(0)).unwrap();
    //let std_deviation = mean.clone();
    let denoised = data - &mean.insert_axis(Axis(1)).t();

    // calculate the covariance matrix
    let covariance = denoised.t().dot(&denoised) / F::cast(nobservations - 1);
    // calculate the standard deviation vector
    let std_deviation = denoised.var_axis(Axis(0), F::one()).mapv(|x| x.sqrt());

    // we will only save the upper triangular matrix as the diagonal is one and
    // the lower triangular is a mirror of the upper triangular part
    let mut pearson_coeffs = Array1::zeros(nfeatures * (nfeatures - 1) / 2);

    let mut k = 0;
    for i in 0..(nfeatures - 1) {
        for j in (i + 1)..nfeatures {
            // calculate pearson correlation coefficients by normalizing the covariance matrix
            pearson_coeffs[k] = covariance[(i, j)] / std_deviation[i] / std_deviation[j];

            k += 1;
        }
    }

    pearson_coeffs
}

/// Evidence of non-correlation with re-sampling test
///
/// The p-value supports or reject the null hypthesis that two variables are not correlated. A
/// small p-value indicates a strong evidence that two variables are correlated.
fn p_values<F: Float, D: Data<Elem = F>>(
    data: &ArrayBase<D, Ix2>,
    ground: &Array1<F>,
    num_iter: usize,
) -> Array1<F> {
    // transpose element matrix such that we can shuffle columns
    let (n, m) = (data.ncols(), data.nrows());
    let mut flattened = Vec::with_capacity(n * m);
    for i in 0..m {
        for j in 0..n {
            flattened.push(data[(i, j)]);
        }
    }

    let mut p_values = Array1::zeros(n * (n - 1) / 2);
    let mut rng = SmallRng::from_entropy();

    // calculate p-values by shuffling features `num_iter` times
    for _ in 0..num_iter {
        // shuffle all corresponding features
        for j in 0..n {
            flattened[j * m..(j + 1) * m].shuffle(&mut rng);
        }

        // create an ndarray and calculate the PCC for this distribution
        let arr_view = ArrayView2::from_shape((m, n), &flattened).unwrap();
        let correlation = pearson_correlation(&arr_view.t());

        // count the number of times that the re-shuffled distribution has a larger PCC than the
        // original distribution
        let greater = ground
            .iter()
            .zip(correlation.iter())
            .map(|(a, b)| {
                if a.abs() < b.abs() {
                    F::one()
                } else {
                    F::zero()
                }
            })
            .collect::<Array1<_>>();

        p_values += &greater;
    }

    // divide by the number of iterations to re-scale range
    p_values / F::cast(num_iter)
}

/// Pearson Correlation Coefficients (or Bivariate Coefficients)
///
/// The PCCs indicate the linear correlation between variables. This type also supports printing
/// the PCC as an upper triangle matrix together with the feature names.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PearsonCorrelation<F> {
    pearson_coeffs: Array1<F>,
    p_values: Array1<F>,
    feature_names: Vec<String>,
}

impl<F: Float> PearsonCorrelation<F> {
    /// Calculate the Pearson Correlation Coefficients and optionally p-values from dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// The p-value supports or reject the null hypthesis that two variables are not correlated. A
    /// small p-value indicates a strong evidence that two variables are correlated.
    ///
    /// # Parameters
    ///
    /// * `dataset`: Data for the correlation analysis
    /// * `num_iter`: optionally number of iterations of the p-value test, if none then no p-value
    /// are calculate
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation_with_p_value(100);
    ///
    /// println!("{}", corr);
    /// ```
    ///
    /// The output looks like this (the p-value is in brackets behind the PCC):
    ///
    /// ```ignore
    /// age                        +0.17 (0.61) +0.18 (0.62) +0.33 (0.34) +0.26 (0.47) +0.22 (0.54) -0.07 (0.83) +0.20 (0.60) +0.27 (0.54) +0.30 (0.41)
    /// sex                                     +0.09 (0.74) +0.24 (0.59) +0.04 (0.91) +0.14 (0.74) -0.38 (0.28) +0.33 (0.30) +0.15 (0.74) +0.21 (0.58)
    /// body mass index                                      +0.39 (0.20) +0.25 (0.45) +0.26 (0.51) -0.37 (0.31) +0.41 (0.24) +0.45 (0.21) +0.39 (0.21)
    /// blood pressure                                                    +0.24 (0.54) +0.19 (0.56) -0.18 (0.61) +0.26 (0.45) +0.39 (0.20) +0.39 (0.16)
    /// t-cells                                                                        +0.90 (0.00) +0.05 (0.89) +0.54 (0.05) +0.52 (0.10) +0.33 (0.37)
    /// low-density lipoproteins                                                                    -0.20 (0.53) +0.66 (0.04) +0.32 (0.42) +0.29 (0.42)
    /// high-density lipoproteins                                                                                -0.74 (0.02) -0.40 (0.21) -0.27 (0.42)
    /// thyroid stimulating hormone                                                                                           +0.62 (0.04) +0.42 (0.21)
    /// lamotrigine                                                                                                                        +0.47 (0.14)
    /// blood sugar level
    /// ```

    pub fn from_dataset<D: Data<Elem = F>, T>(
        dataset: &DatasetBase<ArrayBase<D, Ix2>, T>,
        num_iter: Option<usize>,
    ) -> Self {
        // calculate pearson coefficients
        let pearson_coeffs = pearson_correlation(dataset.records());

        // calculate p values
        let p_values = match num_iter {
            Some(num_iter) => p_values(dataset.records(), &pearson_coeffs, num_iter),
            None => Array1::zeros(0),
        };

        PearsonCorrelation {
            pearson_coeffs,
            p_values,
            feature_names: dataset.feature_names(),
        }
    }

    /// Return the Pearson's Correlation Coefficients
    ///
    /// The coefficients are describing the linear correlation, normalized in range (-1, 1) between
    /// two variables. Because the correlation is commutative and PCC to the same variable is
    /// always perfectly correlated (i.e. 1), this function only returns the upper triangular
    /// matrix with (n-1)*n/2 elements.
    pub fn get_coeffs(&self) -> &Array1<F> {
        &self.pearson_coeffs
    }

    /// Return the p values supporting the null-hypothesis
    ///
    /// This implementation estimates the p value with the permutation test. As null-hypothesis
    /// the non-correlation between two variables is chosen such that the smaller the p-value the
    /// stronger we can reject the null-hypothesis and conclude that they are linearily correlated.
    pub fn get_p_values(&self) -> Option<&Array1<F>> {
        if self.p_values.is_empty() {
            None
        } else {
            Some(&self.p_values)
        }
    }
}

impl<F: Float, D: Data<Elem = F>, T> DatasetBase<ArrayBase<D, Ix2>, T> {
    /// Calculate the Pearson Correlation Coefficients from a dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation();
    ///
    /// println!("{}", corr);
    /// ```
    ///
    pub fn pearson_correlation(&self) -> PearsonCorrelation<F> {
        PearsonCorrelation::from_dataset(self, None)
    }

    /// Calculate the Pearson Correlation Coefficients and p-values from the dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// The p-value supports or reject the null hypthesis that two variables are not correlated.
    /// The smaller the p-value the stronger is the evidence that two variables are correlated. A
    /// typical threshold is p < 0.05.
    ///
    /// # Parameters
    ///
    /// * `num_iter`: number of iterations of the permutation test to estimate the p-value
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation_with_p_value(100);
    ///
    /// println!("{}", corr);
    /// ```
    ///
    pub fn pearson_correlation_with_p_value(&self, num_iter: usize) -> PearsonCorrelation<F> {
        PearsonCorrelation::from_dataset(self, Some(num_iter))
    }
}

/// Display the Pearson's Correlation Coefficients as upper triangular matrix
///
/// This function prints the feature names for each row, the corresponding PCCs and optionally the
/// p-values in brackets after the PCCs.
impl<F: Float> fmt::Display for PearsonCorrelation<F> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let n = self.feature_names.len();
        let longest = self.feature_names.iter().map(|x| x.len()).max().unwrap();

        let mut k = 0;
        for i in 0..(n - 1) {
            write!(f, "{}", self.feature_names[i])?;
            for _ in 0..longest - self.feature_names[i].len() {
                write!(f, " ")?;
            }

            for _ in 0..i {
                write!(f, "             ")?;
            }

            for _ in (i + 1)..n {
                if !self.p_values.is_empty() {
                    write!(
                        f,
                        "{:+.2} ({:.2}) ",
                        self.pearson_coeffs[k], self.p_values[k]
                    )?;
                } else {
                    write!(f, "{:.2} ", self.pearson_coeffs[k])?;
                }

                k += 1;
            }
            writeln!(f,)?;
        }
        writeln!(f, "{}", self.feature_names[n - 1])?;

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use crate::DatasetBase;
    use ndarray::{concatenate, Array, Axis};
    use ndarray_rand::{rand_distr::Uniform, RandomExt};
    use rand::{rngs::SmallRng, SeedableRng};

    #[test]
    fn uniform_random() {
        // create random number generator and random matrix with uniform distribution
        let mut rng = SmallRng::seed_from_u64(42);
        let data = Array::random_using((1000, 4), Uniform::new(-1., 1.), &mut rng);

        // calculate PCCs and test that they are indeed near zero
        let pcc = DatasetBase::from(data).pearson_correlation();
        assert!(pcc.get_coeffs().mapv(|x: f32| x.abs()).sum() < 5e-2 * 6.0);
    }

    #[test]
    fn perfectly_correlated() {
        let mut rng = SmallRng::seed_from_u64(42);
        let v = Array::random_using((4, 1), Uniform::new(0., 1.), &mut rng);

        // project feature with matrix
        let data = Array::random_using((1000, 1), Uniform::new(-1., 1.), &mut rng);
        let data_proj = data.dot(&v.t());

        let corr = DatasetBase::from(concatenate![Axis(1), data, data_proj])
            .pearson_correlation_with_p_value(100);

        assert!(corr.get_coeffs().mapv(|x| 1. - x).sum() < 1e-2);
        assert!(corr.get_p_values().unwrap().sum() < 1e-2);
    }
}