xdlol 0.0.0 - Docs.rs

//! Chi-square test
//!
//! # Reference
//!
//! Knuth, D. E. "The Art of Computer Programming, 3.3.1 General Test Procedures for Studying Random Data." _The Art of Computer Programming; Volume 2: Seminumerical Algorithms, Third Edition_: 42-60.
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

use std::f64;
use std::fmt::Debug;
use num::ToPrimitive;
use arithmetic::UnsignedArithmetic;
pub use statistics::FrequencyMap;

use statrs::function::gamma::{gamma, gamma_li};

/// Determine if the chi-square value is high for a distribution.
///
/// # Example
///
/// ```
/// # extern crate xdlol;
/// # extern crate rand;
/// #
/// # fn main() {
/// use xdlol::statistics::chi_square_test;
/// use rand::random;
///
/// let upper_bound = std::u8::MAX;
/// let mut random_samples: Vec<u8> = Vec::new();
/// for _ in 0..3000 {
///     random_samples.push(random::<u8>());
/// }
/// assert!(true, chi_square_test(random_samples, upper_bound as usize));
/// # }
/// ```
pub fn chi_square_test(samples: Vec<u8>, population_size: usize) -> bool {
    let sample_size = samples.iter().count();
    if sample_size <= 10 * population_size {
        return false;
    }

    let freq_map: FrequencyMap<u8, u64> = samples.iter().cloned().collect();
    if freq_map.values().any(|x| *x < 5) {
        return false;
    }

    if freq_map.values().count() != population_size {
        return false;
    }

    let chi_square = chi_square(freq_map.values().cloned());

    (chi_square - population_size as f64).abs()
        <= 2u64.pow((population_size as f64).sqrt() as u32) as f64
}

/// Calculates the chi-square value for n unsigned integers less than r
///
/// Assumes a uniform distribution.
///
/// # Example
///
/// ```
/// # extern crate xdlol;
/// # extern crate rand;
/// #
/// # fn main() {
/// use xdlol::statistics::chi_square;
/// use xdlol::statistics::FrequencyMap;
/// use rand::random;
///
/// let upper_bound = std::u8::MAX;
/// let mut samples: Vec<u8> = Vec::new();
/// for _ in 0..3000 {
///     samples.push(random::<u8>());
/// }
/// let freq_map: FrequencyMap<u8, u64> = samples.iter().cloned().collect();
///
/// println!("{}", chi_square(freq_map.values().cloned()));
/// # }
/// ```
///
/// # Reference
///
/// Knuth, D. E. "The Art of Computer Programming, 3.3.1.A Chi-square tests." _The Art of Computer Programming; Volume 2: Seminumerical Algorithms, Third Edition_: 42-48.
pub fn chi_square<T, I: IntoIterator<Item = T>>(freqs: I) -> f64
where
    I: Clone,
    T: ToPrimitive + UnsignedArithmetic + Debug,
{
    let num_categories = freqs.clone().into_iter().count();
    let num_samples = freqs
        .clone()
        .into_iter()
        .map(|v| v.to_usize().unwrap())
        .sum::<usize>();
    let expected = (num_samples / num_categories) as f64;
    freqs
        .into_iter()
        .map(|actual| (actual.to_i64().unwrap() - expected as i64).pow(2) as f64 / expected)
        .sum()
}

/// Calculates the P-Value.
///
/// The P-Value is the probability that the difference between the observed
/// and expected values would happen purely by chance. Two values are
/// required to calculate the P-Value, the _Degrees of Freedom_ (DOF) and
/// the _Critical Value_.
pub fn p_value(dof: u64, critical_value: f64) -> f64 {
    if critical_value < 0.0 || dof < 1 {
        return 0.0;
    }
    let K: f64 = dof as f64 * 0.5;
    let X: f64 = critical_value * 0.5;
    let e: f64 = f64::consts::E;
    if dof == 2 {
        return e.powf(-1.0 * X);
    }
    let mut p_value: f64 = gamma_li(K, X);
    if p_value.is_nan() || p_value.is_infinite() || p_value <= 1e-8 {
        return 1e-14;
    }
    p_value /= gamma(K);
    1.0 - p_value
}

#[cfg(test)]
mod tests {
    extern crate float_cmp;
    extern crate rand;

    use super::*;
    use self::float_cmp::ApproxEqUlps;
    use self::rand::random;

    // -------------------------------------------------------------------------
    // chi_square()
    // -------------------------------------------------------------------------

    // Chi Square tested against Pythons scipy.stats.chisquare function.
    //
    // ```python
    // >>> chisquare([1, 1, 3, 5, 0])
    // Power_divergenceResult(statistic=8.0, pvalue=0.091578194443670893)
    // ```
    //
    // Also verified by hand.
    #[test]
    fn test_chi_square() {
        let test_data: Vec<u8> = vec![1, 1, 3, 5, 0];
        let expected = 8_f64;
        let chi_square = chi_square(test_data);
        assert!(
            chi_square.approx_eq_ulps(&expected, 2),
            "Expected a chi-squared value of {}, found {}",
            expected,
            chi_square
        );
    }

    // Test for no panic with large input.
    #[test]
    fn test_chi_square_large_dataset() {
        let mut test_data: Vec<u8> = Vec::new();
        for _ in 0..10000 {
            test_data.push(random::<u8>());
        }
        chi_square(test_data);
    }

    // -------------------------------------------------------------------------
    // p_value()
    // -------------------------------------------------------------------------

    #[test]
    fn p_value_basic() {
        let expected = 0.0636423441307552;
        let p_value = p_value(255, 290.285192);
        assert!(
            p_value.approx_eq_ulps(&expected, 2),
            "Expected P-Value of {}, found {}",
            expected,
            p_value
        );
    }

    // Test result against Scipy
    //
    // Expected value generated with the Python Scipy library:
    //
    // ```python
    // >>> from scipy.stats import chisquare
    // >>> chisquare([1, 2, 3, 4])
    // ... Power_divergenceResult(statistic=2.0, pvalue=0.57240670447087982)
    // ```
    //
    // With different implementations of the `gamma` function a
    // larger difference is expected than two native calculations
    // so a larger _unit in the last place_ (ULPS) is used.
    #[test]
    fn p_value_against_scipy() {
        let test_data: Vec<u64> = vec![1, 2, 3, 4];
        let expected_pvalue = 0.57240670447087982;

        let num_samples = 10_f64;
        let num_categories = 4_f64;
        let expected_sample = num_samples / num_categories;

        let dof = (test_data.iter().count() - 1) as u64;
        let critical_value = test_data
            .iter()
            .map(|x| *x as f64 - expected_sample)
            .fold(0_f64, |acc, x| acc + (x * x) / expected_sample);

        let p_value = p_value(dof, critical_value);
        assert!(
            p_value.approx_eq_ulps(&expected_pvalue, 8),
            "Expected P-Value of {:.64}, found {:.64}",
            expected_pvalue,
            p_value
        );
    }

    // -------------------------------------------------------------------------
    // chi_square_test()
    // -------------------------------------------------------------------------

    // The test is invalid when the sample population is too small.
    // A good minimum to set is 10 times as many samples as the
    // population size.
    #[test]
    fn chi_square_test_failure_sample_size_too_small() {
        let test_data: Vec<u8> = vec![1, 2, 3, 4, 5];
        let population_size: usize = 5;
        assert!(
            !chi_square_test(test_data, population_size),
            "Test is invalid if sample size ({}) <= 10 * population size ({})",
            5,
            5
        );

        let test_data_edge_case: Vec<u8> = vec![
            2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4, 3, 4, 4, 4, 3, 3, 2, 3, 4, 2
        ];
        let population_size: usize = 3;
        assert!(
            !chi_square_test(test_data_edge_case, population_size),
            "Test is invalid if sample size ({}) <= 10 * population size ({})",
            29,
            30
        );
    }

    // This test is invalid when the observed or expected frequencies
    // in each category are too small. A typical rule is that all of
    // the observed and expected frequencies should be at least 5.
    //
    // The test below has a population size of 4: (1, 2, 3, 4). It
    // satisfies the sample size test with a 42 samples, and
    // there are only four samples of 1 to trigger failure. The
    // random samples after the 1s  were generated with python:
    //
    // ```python
    // >>> import random
    // >>> for _ in range(42):
    // ...   print(random.choice([2, 3, 4]), end=', ')
    // ```
    //
    // This data was tested to pass the chi-square test when
    // the code is removed to catch this and fail the test.
    #[test]
    fn chi_square_test_failure_element_frequency_too_small() {
        let test_data: Vec<u8> = vec![
            1, 1, 1, 1, 3, 3, 2, 4, 2, 4, 2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4,
            3, 4, 4, 4, 3, 3, 2, 3, 4, 2, 3, 3, 2,
        ];
        let population_size: usize = 4;
        assert!(
            !chi_square_test(test_data, population_size),
            "Test is invalid if not all frequencies are at least 5"
        );
    }

    // For the test to be valid each element must have been
    // sampled at least 5 times. Test that an element missing
    // completely from the set of samples also fails the test.
    //
    // Below the element `1` is missing.
    #[test]
    fn chi_square_test_failure_element_zero_frequency() {
        let test_data: Vec<u8> = vec![
            3, 2, 3, 3, 3, 3, 2, 4, 2, 4, 2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4,
            3, 4, 4, 4, 3, 3, 2, 3, 4, 2, 3, 3, 2,
        ];
        let population_size: usize = 4;
        assert!(
            !chi_square_test(test_data, population_size),
            "Test is invalid if not all frequencies are at least 5"
        );
    }

    // A random sample large enough to satisfy the test.
    //
    // Samples were generated with python:
    //
    // ```python
    // >>> import random
    // >>> for _ in range(42):
    // ...   pirnt(random.choice([1, 2, 3, 4]), end=', ')
    // ```
    #[test]
    fn chi_square_test_pass() {
        let test_data: Vec<u8> = vec![
            3, 1, 3, 4, 4, 3, 2, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 1, 4, 3, 1, 2, 2, 4, 1, 4, 3, 2, 1,
            4, 1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 2, 3,
        ];
        let population_size: usize = 4;
        assert!(chi_square_test(test_data, population_size));
    }
}