use crate::error::{StatsError, StatsResult};
use crate::prob::erf;
use crate::utils::constants::SQRT_2;
use num_traits::ToPrimitive;
use std::fmt::Debug;
pub fn chi_square_goodness_of_fit<T, U>(
observed: &[T],
expected: &[U],
) -> StatsResult<(f64, usize, f64)>
where
T: ToPrimitive + Debug + Copy,
U: ToPrimitive + Debug + Copy,
{
if observed.is_empty() {
return Err(StatsError::empty_data(
"Observed frequencies cannot be empty",
));
}
if expected.is_empty() {
return Err(StatsError::empty_data(
"Expected frequencies cannot be empty",
));
}
if observed.len() != expected.len() {
return Err(StatsError::dimension_mismatch(format!(
"Observed and expected frequencies must have the same length (got {} and {})",
observed.len(),
expected.len()
)));
}
let mut chi_square: f64 = 0.0;
let degrees_of_freedom = observed.len() - 1;
for i in 0..observed.len() {
let obs = observed[i].to_f64().ok_or_else(|| {
StatsError::conversion_error(format!(
"Failed to convert observed value at index {} to f64",
i
))
})?;
let exp = expected[i].to_f64().ok_or_else(|| {
StatsError::conversion_error(format!(
"Failed to convert expected value at index {} to f64",
i
))
})?;
if exp <= 0.0 {
return Err(StatsError::invalid_parameter(format!(
"Expected frequencies must be positive (got {} at index {})",
exp, i
)));
}
let diff = obs - exp;
chi_square += (diff * diff) / exp;
}
let p_value = if degrees_of_freedom > 0 {
let mut z = (chi_square / degrees_of_freedom as f64).powf(1.0 / 3.0)
- (1.0 - 2.0 / (9.0 * degrees_of_freedom as f64));
z /= (2.0 / (9.0 * degrees_of_freedom as f64)).sqrt();
0.5 * (1.0 + erf(z / SQRT_2)?)
} else {
1.0 };
Ok((chi_square, degrees_of_freedom, 1.0 - p_value))
}
pub fn chi_square_independence<T>(observed_matrix: &[Vec<T>]) -> StatsResult<(f64, usize, f64)>
where
T: ToPrimitive + Debug + Copy,
{
if observed_matrix.is_empty() {
return Err(StatsError::empty_data("Observed matrix cannot be empty"));
}
let row_count = observed_matrix.len();
let col_count = observed_matrix[0].len();
for (row_idx, row) in observed_matrix.iter().enumerate() {
if row.len() != col_count {
return Err(StatsError::dimension_mismatch(format!(
"All rows in the observed matrix must have the same length (row {} has length {}, expected {})",
row_idx,
row.len(),
col_count
)));
}
}
let mut row_sums: Vec<f64> = vec![0.0; row_count];
let mut col_sums: Vec<f64> = vec![0.0; col_count];
let mut total_sum = 0.0;
for i in 0..row_count {
for (j, col_sum) in col_sums.iter_mut().enumerate().take(col_count) {
let value = observed_matrix[i]
.get(j)
.ok_or_else(|| {
StatsError::index_out_of_bounds(format!(
"Index out of bounds: row {}, column {}",
i, j
))
})?
.to_f64()
.ok_or_else(|| {
StatsError::conversion_error(format!(
"Failed to convert observed value at row {}, column {} to f64",
i, j
))
})?;
row_sums[i] += value;
*col_sum += value;
total_sum += value;
}
}
let mut chi_square = 0.0;
for i in 0..row_count {
for (j, &col_sum) in col_sums.iter().enumerate().take(col_count) {
let expected = (row_sums[i] * col_sum) / total_sum;
if expected <= 0.0 {
return Err(StatsError::invalid_parameter(format!(
"Expected frequency must be positive (got {} at row {}, column {})",
expected, i, j
)));
}
let observed = observed_matrix[i]
.get(j)
.ok_or_else(|| {
StatsError::index_out_of_bounds(format!(
"Index out of bounds: row {}, column {}",
i, j
))
})?
.to_f64()
.ok_or_else(|| {
StatsError::conversion_error(format!(
"Failed to convert observed value at row {}, column {} to f64",
i, j
))
})?;
let diff = observed - expected;
chi_square += (diff * diff) / expected;
}
}
let degrees_of_freedom = (row_count - 1) * (col_count - 1);
let p_value = if degrees_of_freedom > 0 {
let mut z = (chi_square / degrees_of_freedom as f64).powf(1.0 / 3.0)
- (1.0 - 2.0 / (9.0 * degrees_of_freedom as f64));
z /= (2.0 / (9.0 * degrees_of_freedom as f64)).sqrt();
0.5 * (1.0 + erf(z / SQRT_2)?)
} else {
1.0 };
Ok((chi_square, degrees_of_freedom, 1.0 - p_value))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_chi_square_goodness_of_fit_basic() {
let observed = vec![24, 20, 18, 22, 15, 21]; let expected = vec![20.0, 20.0, 20.0, 20.0, 20.0, 20.0];
let (statistic, df, _) = chi_square_goodness_of_fit(&observed, &expected).unwrap();
assert_eq!(df, 5, "Degrees of freedom should be 5");
let expected_chi_square = 2.5;
assert!(
(statistic - expected_chi_square).abs() < 1e-10,
"Chi-square statistic should be approximately {}",
expected_chi_square
);
}
#[test]
fn test_chi_square_goodness_of_fit_different_types() {
let observed = vec![10, 15, 25];
let expected = vec![16.6667, 16.6667, 16.6667];
let (statistic, df, _) = chi_square_goodness_of_fit(&observed, &expected).unwrap();
assert_eq!(df, 2, "Degrees of freedom should be 2");
let expected_chi_square = (10.0 - 16.6667) * (10.0 - 16.6667) / 16.6667
+ (15.0 - 16.6667) * (15.0 - 16.6667) / 16.6667
+ (25.0 - 16.6667) * (25.0 - 16.6667) / 16.6667;
assert!(
(statistic - expected_chi_square).abs() < 1e-4,
"Chi-square statistic should be approximately {}",
expected_chi_square
);
}
#[test]
fn test_chi_square_independence_basic() {
let observed = vec![vec![45, 55], vec![60, 40]];
let (statistic, df, _) = chi_square_independence(&observed).unwrap();
assert_eq!(df, 1, "Degrees of freedom should be 1");
println!("Actual chi-square statistic: {}", statistic);
let expected_chi_square = 4.51127;
assert!(
((statistic * 100_f64 / 100_f64) - expected_chi_square).abs() < 1e-3,
"Chi-square statistic should be approximately {}",
expected_chi_square
);
}
#[test]
fn test_chi_square_independence_large_matrix() {
let observed = vec![vec![30, 25, 15], vec![35, 40, 30], vec![20, 30, 25]];
let (statistic, df, _) = chi_square_independence(&observed).unwrap();
assert_eq!(df, 4, "Degrees of freedom should be 4");
assert!(statistic > 0.0, "Chi-square statistic should be positive");
}
#[test]
fn test_chi_square_goodness_of_fit_zero_expected() {
let observed = vec![10, 15, 20];
let expected = vec![15.0, 0.0, 30.0];
let result = chi_square_goodness_of_fit(&observed, &expected);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
StatsError::InvalidParameter { .. }
));
}
#[test]
fn test_chi_square_goodness_of_fit_mismatched_lengths() {
let observed = vec![10, 15, 20, 25];
let expected = vec![15.0, 20.0, 35.0];
let result = chi_square_goodness_of_fit(&observed, &expected);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
StatsError::DimensionMismatch { .. }
));
}
#[test]
fn test_chi_square_goodness_of_fit_empty_observed() {
let observed: Vec<u32> = vec![];
let expected = vec![15.0, 20.0, 35.0];
let result = chi_square_goodness_of_fit(&observed, &expected);
assert!(result.is_err());
assert!(matches!(result.unwrap_err(), StatsError::EmptyData { .. }));
}
#[test]
fn test_chi_square_independence_empty_matrix() {
let observed: Vec<Vec<u32>> = vec![];
let result = chi_square_independence(&observed);
assert!(result.is_err());
assert!(matches!(result.unwrap_err(), StatsError::EmptyData { .. }));
}
#[test]
fn test_chi_square_independence_mismatched_rows() {
let observed = vec![vec![10, 15], vec![20, 25, 30]];
let result = chi_square_independence(&observed);
assert!(result.is_err());
assert!(matches!(
result.unwrap_err(),
StatsError::DimensionMismatch { .. }
));
}
#[test]
fn test_chi_square_goodness_of_fit_df_zero() {
let observed = vec![10];
let expected = vec![10.0];
let (statistic, df, p_value) = chi_square_goodness_of_fit(&observed, &expected).unwrap();
assert_eq!(df, 0, "Degrees of freedom should be 0 for single category");
assert_eq!(
statistic, 0.0,
"Chi-square statistic should be 0 when observed equals expected"
);
assert_eq!(
p_value, 0.0,
"p-value should be 0.0 when df == 0 (1.0 - 1.0)"
);
}
#[test]
fn test_chi_square_goodness_of_fit_empty_expected() {
let observed = vec![10, 15, 20];
let expected: Vec<f64> = vec![];
let result = chi_square_goodness_of_fit(&observed, &expected);
assert!(result.is_err());
assert!(matches!(result.unwrap_err(), StatsError::EmptyData { .. }));
}
}