use std::f64;
use std::fmt::Debug;
use num::ToPrimitive;
use arithmetic::UnsignedArithmetic;
pub use statistics::FrequencyMap;
use statrs::function::gamma::{gamma, gamma_li};
pub fn chi_square_test(samples: Vec<u8>, population_size: usize) -> bool {
let sample_size = samples.iter().count();
if sample_size <= 10 * population_size {
return false;
}
let freq_map: FrequencyMap<u8, u64> = samples.iter().cloned().collect();
if freq_map.values().any(|x| *x < 5) {
return false;
}
if freq_map.values().count() != population_size {
return false;
}
let chi_square = chi_square(freq_map.values().cloned());
(chi_square - population_size as f64).abs()
<= 2u64.pow((population_size as f64).sqrt() as u32) as f64
}
pub fn chi_square<T, I: IntoIterator<Item = T>>(freqs: I) -> f64
where
I: Clone,
T: ToPrimitive + UnsignedArithmetic + Debug,
{
let num_categories = freqs.clone().into_iter().count();
let num_samples = freqs
.clone()
.into_iter()
.map(|v| v.to_usize().unwrap())
.sum::<usize>();
let expected = (num_samples / num_categories) as f64;
freqs
.into_iter()
.map(|actual| (actual.to_i64().unwrap() - expected as i64).pow(2) as f64 / expected)
.sum()
}
pub fn p_value(dof: u64, critical_value: f64) -> f64 {
if critical_value < 0.0 || dof < 1 {
return 0.0;
}
let K: f64 = dof as f64 * 0.5;
let X: f64 = critical_value * 0.5;
let e: f64 = f64::consts::E;
if dof == 2 {
return e.powf(-1.0 * X);
}
let mut p_value: f64 = gamma_li(K, X);
if p_value.is_nan() || p_value.is_infinite() || p_value <= 1e-8 {
return 1e-14;
}
p_value /= gamma(K);
1.0 - p_value
}
#[cfg(test)]
mod tests {
extern crate float_cmp;
extern crate rand;
use super::*;
use self::float_cmp::ApproxEqUlps;
use self::rand::random;
#[test]
fn test_chi_square() {
let test_data: Vec<u8> = vec![1, 1, 3, 5, 0];
let expected = 8_f64;
let chi_square = chi_square(test_data);
assert!(
chi_square.approx_eq_ulps(&expected, 2),
"Expected a chi-squared value of {}, found {}",
expected,
chi_square
);
}
#[test]
fn test_chi_square_large_dataset() {
let mut test_data: Vec<u8> = Vec::new();
for _ in 0..10000 {
test_data.push(random::<u8>());
}
chi_square(test_data);
}
#[test]
fn p_value_basic() {
let expected = 0.0636423441307552;
let p_value = p_value(255, 290.285192);
assert!(
p_value.approx_eq_ulps(&expected, 2),
"Expected P-Value of {}, found {}",
expected,
p_value
);
}
#[test]
fn p_value_against_scipy() {
let test_data: Vec<u64> = vec![1, 2, 3, 4];
let expected_pvalue = 0.57240670447087982;
let num_samples = 10_f64;
let num_categories = 4_f64;
let expected_sample = num_samples / num_categories;
let dof = (test_data.iter().count() - 1) as u64;
let critical_value = test_data
.iter()
.map(|x| *x as f64 - expected_sample)
.fold(0_f64, |acc, x| acc + (x * x) / expected_sample);
let p_value = p_value(dof, critical_value);
assert!(
p_value.approx_eq_ulps(&expected_pvalue, 8),
"Expected P-Value of {:.64}, found {:.64}",
expected_pvalue,
p_value
);
}
#[test]
fn chi_square_test_failure_sample_size_too_small() {
let test_data: Vec<u8> = vec![1, 2, 3, 4, 5];
let population_size: usize = 5;
assert!(
!chi_square_test(test_data, population_size),
"Test is invalid if sample size ({}) <= 10 * population size ({})",
5,
5
);
let test_data_edge_case: Vec<u8> = vec![
2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4, 3, 4, 4, 4, 3, 3, 2, 3, 4, 2
];
let population_size: usize = 3;
assert!(
!chi_square_test(test_data_edge_case, population_size),
"Test is invalid if sample size ({}) <= 10 * population size ({})",
29,
30
);
}
#[test]
fn chi_square_test_failure_element_frequency_too_small() {
let test_data: Vec<u8> = vec![
1, 1, 1, 1, 3, 3, 2, 4, 2, 4, 2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4,
3, 4, 4, 4, 3, 3, 2, 3, 4, 2, 3, 3, 2,
];
let population_size: usize = 4;
assert!(
!chi_square_test(test_data, population_size),
"Test is invalid if not all frequencies are at least 5"
);
}
#[test]
fn chi_square_test_failure_element_zero_frequency() {
let test_data: Vec<u8> = vec![
3, 2, 3, 3, 3, 3, 2, 4, 2, 4, 2, 4, 4, 4, 3, 2, 3, 2, 4, 2, 2, 3, 3, 2, 3, 3, 3, 2, 4,
3, 4, 4, 4, 3, 3, 2, 3, 4, 2, 3, 3, 2,
];
let population_size: usize = 4;
assert!(
!chi_square_test(test_data, population_size),
"Test is invalid if not all frequencies are at least 5"
);
}
#[test]
fn chi_square_test_pass() {
let test_data: Vec<u8> = vec![
3, 1, 3, 4, 4, 3, 2, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 1, 4, 3, 1, 2, 2, 4, 1, 4, 3, 2, 1,
4, 1, 1, 2, 2, 3, 3, 1, 1, 4, 4, 2, 3,
];
let population_size: usize = 4;
assert!(chi_square_test(test_data, population_size));
}
}