use crate::error::{Result, Error};
use crate::dataframe::DataFrame;
use crate::stats::{ChiSquareResult, AnovaResult};
use std::collections::{HashMap, HashSet};
use std::f64::consts::LN_2;
use std::fmt::Debug;
/// A two-way contingency table: observed counts of two categorical variables
/// cross-classified against each other, with precomputed marginal totals.
#[derive(Debug, Clone)]
pub struct ContingencyTable {
    /// Observed cell counts, indexed as `observed[row][col]`.
    pub observed: Vec<Vec<f64>>,
    /// Category labels for the rows (first variable).
    pub row_labels: Vec<String>,
    /// Category labels for the columns (second variable).
    pub col_labels: Vec<String>,
    /// Sum of each row of `observed`.
    pub row_totals: Vec<f64>,
    /// Sum of each column of `observed`.
    pub col_totals: Vec<f64>,
    /// Grand total (sum of all cells).
    pub total: f64,
}
impl ContingencyTable {
    /// Creates an empty table (no cells, no labels, zero total).
    ///
    /// Most analysis methods require a populated table; prefer
    /// [`ContingencyTable::from_observed`] for real data.
    pub fn new() -> Self {
        ContingencyTable {
            observed: Vec::new(),
            row_labels: Vec::new(),
            col_labels: Vec::new(),
            row_totals: Vec::new(),
            col_totals: Vec::new(),
            total: 0.0,
        }
    }

    /// Builds a table from a matrix of observed counts plus axis labels,
    /// validating the shape and computing row/column/grand totals.
    ///
    /// # Errors
    /// * `EmptyData` — `observed` has no rows.
    /// * `DimensionMismatch` — label counts don't match the matrix shape,
    ///   or the rows are ragged.
    /// * `InvalidInput` — any cell is negative.
    pub fn from_observed(
        observed: Vec<Vec<f64>>,
        row_labels: Vec<String>,
        col_labels: Vec<String>,
    ) -> Result<Self> {
        if observed.is_empty() {
            return Err(Error::EmptyData("Contingency table requires data".into()));
        }
        let rows = observed.len();
        let cols = observed[0].len();
        if row_labels.len() != rows {
            return Err(Error::DimensionMismatch(
                format!("Number of row labels ({}) must match number of rows ({})",
                    row_labels.len(), rows)
            ));
        }
        if col_labels.len() != cols {
            return Err(Error::DimensionMismatch(
                format!("Number of column labels ({}) must match number of columns ({})",
                    col_labels.len(), cols)
            ));
        }
        for (i, row) in observed.iter().enumerate() {
            if row.len() != cols {
                return Err(Error::DimensionMismatch(
                    format!("Row {} has {} columns, expected {}", i, row.len(), cols)
                ));
            }
        }
        // Accumulate marginals while rejecting negative counts.
        let mut row_totals = vec![0.0; rows];
        let mut col_totals = vec![0.0; cols];
        let mut total = 0.0;
        for (i, row) in observed.iter().enumerate() {
            for (j, &value) in row.iter().enumerate() {
                if value < 0.0 {
                    return Err(Error::InvalidInput("Contingency table cannot contain negative values".into()));
                }
                row_totals[i] += value;
                col_totals[j] += value;
                total += value;
            }
        }
        Ok(ContingencyTable {
            observed,
            row_labels,
            col_labels,
            row_totals,
            col_totals,
            total,
        })
    }

    /// Expected cell frequencies under the independence hypothesis:
    /// `E[i][j] = row_total[i] * col_total[j] / total`.
    ///
    /// Returns an empty matrix for an empty table and an all-zero matrix when
    /// the grand total is zero, so callers never see a panic or NaN here.
    pub fn expected_frequencies(&self) -> Vec<Vec<f64>> {
        if self.observed.is_empty() {
            return Vec::new();
        }
        let rows = self.observed.len();
        let cols = self.observed[0].len();
        let mut expected = vec![vec![0.0; cols]; rows];
        if self.total <= 0.0 {
            // No observations at all: every expected count is zero
            // (avoids 0/0 = NaN below).
            return expected;
        }
        for i in 0..rows {
            for j in 0..cols {
                expected[i][j] = self.row_totals[i] * self.col_totals[j] / self.total;
            }
        }
        expected
    }

    /// Pearson's chi-square test of independence.
    ///
    /// `alpha` is only used to set the `significant` flag; the statistic and
    /// p-value do not depend on it.
    ///
    /// # Errors
    /// * `InvalidInput` — `alpha` outside the open interval (0, 1).
    /// * `EmptyData` — the table has no cells.
    /// * `ComputationError` — an expected frequency is (near) zero, which
    ///   would make the statistic undefined.
    pub fn chi_square_test(&self, alpha: f64) -> Result<ChiSquareResult> {
        if alpha <= 0.0 || alpha >= 1.0 {
            return Err(Error::InvalidInput("Significance level (alpha) must be between 0 and 1".into()));
        }
        if self.observed.is_empty() {
            return Err(Error::EmptyData("Contingency table requires data".into()));
        }
        let expected = self.expected_frequencies();
        let rows = self.observed.len();
        let cols = self.observed[0].len();
        let mut chi2_stat = 0.0;
        for i in 0..rows {
            for j in 0..cols {
                if expected[i][j] < 1e-10 {
                    return Err(Error::ComputationError("Expected frequency too small".into()));
                }
                let diff = self.observed[i][j] - expected[i][j];
                chi2_stat += diff * diff / expected[i][j];
            }
        }
        // Degrees of freedom for an r x c table: (r - 1)(c - 1).
        let df = (rows - 1) * (cols - 1);
        let p_value = chi2_to_pvalue(chi2_stat, df);
        Ok(ChiSquareResult {
            chi2_statistic: chi2_stat,
            p_value,
            df,
            significant: p_value < alpha,
            expected_freq: expected,
        })
    }

    /// Cramer's V measure of association, in [0, 1]:
    /// `V = sqrt(chi2 / (n * min(r - 1, c - 1)))`.
    ///
    /// # Errors
    /// * `InvalidInput` — table smaller than 2x2.
    /// * `ComputationError` — fewer than one observation, or propagated from
    ///   the chi-square computation.
    pub fn cramers_v(&self) -> Result<f64> {
        let rows = self.observed.len();
        if rows < 2 {
            return Err(Error::InvalidInput("Cramer's V requires at least 2x2 contingency table".into()));
        }
        let cols = self.observed[0].len();
        if cols < 2 {
            return Err(Error::InvalidInput("Cramer's V requires at least 2x2 contingency table".into()));
        }
        // Only the statistic is used below; alpha just has to be valid.
        let chi_square_result = self.chi_square_test(0.05)?;
        let n = self.total;
        if n < 1.0 {
            return Err(Error::ComputationError("Invalid dimensions for Cramer's V".into()));
        }
        // rows, cols >= 2 here, so min_dim >= 1.
        let min_dim = (rows - 1).min(cols - 1);
        Ok((chi_square_result.chi2_statistic / (n * min_dim as f64)).sqrt())
    }

    /// Mutual information I(X; Y) between the two variables, in bits.
    ///
    /// Cells with (near-)zero joint probability are skipped, matching the
    /// limit p * ln(p) -> 0.
    pub fn mutual_information(&self) -> Result<f64> {
        let rows = self.observed.len();
        if rows < 2 {
            return Err(Error::InvalidInput("Mutual information requires at least 2x2 contingency table".into()));
        }
        let cols = self.observed[0].len();
        if cols < 2 {
            return Err(Error::InvalidInput("Mutual information requires at least 2x2 contingency table".into()));
        }
        let mut mi = 0.0;
        for i in 0..rows {
            for j in 0..cols {
                let p_xy = self.observed[i][j] / self.total;
                if p_xy < 1e-10 {
                    continue;
                }
                let p_x = self.row_totals[i] / self.total;
                let p_y = self.col_totals[j] / self.total;
                mi += p_xy * (p_xy / (p_x * p_y)).ln();
            }
        }
        // Convert nats to bits.
        Ok(mi / LN_2)
    }

    /// Normalized mutual information: `2*I(X;Y) / (H(X) + H(Y))`, in [0, 1].
    ///
    /// Returns 0.0 when either marginal entropy is (near) zero, where the
    /// ratio is undefined.
    pub fn normalized_mutual_information(&self) -> Result<f64> {
        let mi = self.mutual_information()?;
        // Marginal entropy (in bits) of a set of category totals.
        let entropy_of = |totals: &[f64]| -> f64 {
            let nats: f64 = totals
                .iter()
                .map(|&t| t / self.total)
                .filter(|&p| p > 0.0)
                .map(|p| -p * p.ln())
                .sum();
            nats / LN_2
        };
        let h_x = entropy_of(&self.row_totals);
        let h_y = entropy_of(&self.col_totals);
        if h_x < 1e-10 || h_y < 1e-10 {
            return Ok(0.0);
        }
        Ok(2.0 * mi / (h_x + h_y))
    }
}
pub(crate) fn dataframe_contingency_table(
df: &DataFrame,
col1: &str,
col2: &str
) -> Result<ContingencyTable> {
if !df.has_column(col1) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", col1)));
}
if !df.has_column(col2) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", col2)));
}
let series1 = df.get_column(col1)?;
let series2 = df.get_column(col2)?;
let values1 = series1.as_str()?;
let values2 = series2.as_str()?;
if values1.len() != values2.len() {
return Err(Error::DimensionMismatch(
format!("Column lengths do not match: {} has length {}, {} has length {}",
col1, values1.len(), col2, values2.len())
));
}
let mut unique1 = HashSet::new();
let mut unique2 = HashSet::new();
for value in &values1 {
unique1.insert(value.clone());
}
for value in &values2 {
unique2.insert(value.clone());
}
let mut row_labels: Vec<_> = unique1.iter().cloned().collect();
let mut col_labels: Vec<_> = unique2.iter().cloned().collect();
row_labels.sort();
col_labels.sort();
let row_map: HashMap<_, _> = row_labels.iter()
.enumerate()
.map(|(i, label)| (label.clone(), i))
.collect();
let col_map: HashMap<_, _> = col_labels.iter()
.enumerate()
.map(|(i, label)| (label.clone(), i))
.collect();
let rows = row_labels.len();
let cols = col_labels.len();
let mut observed = vec![vec![0.0; cols]; rows];
for (val1, val2) in values1.iter().zip(values2.iter()) {
let row = row_map[val1];
let col = col_map[val2];
observed[row][col] += 1.0;
}
ContingencyTable::from_observed(observed, row_labels, col_labels)
}
/// Runs a chi-square test of independence between two string columns of `df`
/// at significance level `alpha`.
pub(crate) fn dataframe_chi_square_test(
    df: &DataFrame,
    col1: &str,
    col2: &str,
    alpha: f64
) -> Result<ChiSquareResult> {
    // Cross-tabulate the two columns, then test the resulting table.
    dataframe_contingency_table(df, col1, col2)?.chi_square_test(alpha)
}
/// Computes Cramer's V association measure between two string columns of `df`.
pub(crate) fn dataframe_cramers_v(
    df: &DataFrame,
    col1: &str,
    col2: &str
) -> Result<f64> {
    // Cross-tabulate the two columns, then measure association on the table.
    dataframe_contingency_table(df, col1, col2)?.cramers_v()
}
/// Computes the normalized mutual information between two string columns
/// of `df`.
pub(crate) fn dataframe_normalized_mutual_information(
    df: &DataFrame,
    col1: &str,
    col2: &str
) -> Result<f64> {
    // Cross-tabulate the two columns, then compute NMI on the table.
    dataframe_contingency_table(df, col1, col2)?.normalized_mutual_information()
}
pub(crate) fn dataframe_categorical_anova(
df: &DataFrame,
cat_col: &str,
numeric_col: &str,
alpha: f64
) -> Result<AnovaResult> {
if !df.has_column(cat_col) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", cat_col)));
}
if !df.has_column(numeric_col) {
return Err(Error::InvalidColumn(format!("Column '{}' does not exist", numeric_col)));
}
let cat_series = df.get_column(cat_col)?;
let cat_values = cat_series.as_str()?;
let num_series = df.get_column(numeric_col)?;
let num_values = num_series.as_f64()?;
if cat_values.len() != num_values.len() {
return Err(Error::DimensionMismatch(
format!("Column lengths do not match: {} has length {}, {} has length {}",
cat_col, cat_values.len(), numeric_col, num_values.len())
));
}
let mut groups: HashMap<String, Vec<f64>> = HashMap::new();
for (cat, num) in cat_values.iter().zip(num_values.iter()) {
groups.entry(cat.clone())
.or_insert_with(Vec::new)
.push(*num);
}
let mut anova_groups: HashMap<&str, &[f64]> = HashMap::new();
for (cat, values) in &groups {
anova_groups.insert(cat.as_str(), values.as_slice());
}
anova_impl(&anova_groups, alpha)
}
/// Upper-tail p-value of the chi-square distribution: P(X >= chi2) with `df`
/// degrees of freedom, i.e. the regularized upper incomplete gamma function
/// Q(df/2, chi2/2).
///
/// Computed with the standard series expansion for `x < a + 1` and a modified
/// Lentz continued fraction otherwise (cf. Numerical Recipes `gammp`/`gammq`).
/// The previous hand-rolled approximation used `x.exp()` where `(-x).exp()`
/// was required and hard-coded three series terms regardless of `df`, giving
/// wrong p-values for most inputs.
fn chi2_to_pvalue(chi2: f64, df: usize) -> f64 {
    // Lanczos approximation of ln(Gamma(z)) (g = 7, n = 9).
    // Only valid for z >= 0.5; here z = df/2 >= 0.5 always holds.
    fn ln_gamma(z: f64) -> f64 {
        const COEF: [f64; 8] = [
            676.5203681218851,
            -1259.1392167224028,
            771.32342877765313,
            -176.61502916214059,
            12.507343278686905,
            -0.13857109526572012,
            9.9843695780195716e-6,
            1.5056327351493116e-7,
        ];
        let z = z - 1.0;
        let mut sum = 0.99999999999980993;
        for (i, &c) in COEF.iter().enumerate() {
            sum += c / (z + i as f64 + 1.0);
        }
        let t = z + 7.5;
        0.5 * (2.0 * std::f64::consts::PI).ln() + (z + 0.5) * t.ln() - t + sum.ln()
    }

    // Regularized lower incomplete gamma P(a, x), a > 0.
    fn lower_gamma_p(a: f64, x: f64) -> f64 {
        if x <= 0.0 {
            return 0.0;
        }
        let ln_prefix = a * x.ln() - x - ln_gamma(a);
        if x < a + 1.0 {
            // Series: P(a,x) = x^a e^-x / Gamma(a) * sum_k x^k / (a(a+1)...(a+k)).
            let mut denom = a;
            let mut term = 1.0 / a;
            let mut sum = term;
            for _ in 0..500 {
                denom += 1.0;
                term *= x / denom;
                sum += term;
                if term < sum * 1e-15 {
                    break;
                }
            }
            sum * ln_prefix.exp()
        } else {
            // Continued fraction for Q(a, x), evaluated with modified Lentz.
            let tiny = 1e-300;
            let mut b = x + 1.0 - a;
            let mut c = 1.0 / tiny;
            let mut d = 1.0 / b;
            let mut h = d;
            for i in 1..500 {
                let an = -(i as f64) * (i as f64 - a);
                b += 2.0;
                d = an * d + b;
                if d.abs() < tiny {
                    d = tiny;
                }
                c = b + an / c;
                if c.abs() < tiny {
                    c = tiny;
                }
                d = 1.0 / d;
                let delta = d * c;
                h *= delta;
                if (delta - 1.0).abs() < 1e-15 {
                    break;
                }
            }
            1.0 - ln_prefix.exp() * h
        }
    }

    if df == 0 {
        // Degenerate distribution with all mass at zero.
        return if chi2 > 0.0 { 0.0 } else { 1.0 };
    }
    if chi2 <= 0.0 {
        return 1.0;
    }
    let p = 1.0 - lower_gamma_p(df as f64 / 2.0, chi2 / 2.0);
    // Clamp away tiny negative residue from floating-point cancellation.
    p.max(0.0).min(1.0)
}
fn anova_impl(
groups: &HashMap<&str, &[f64]>,
alpha: f64
) -> Result<AnovaResult> {
if groups.is_empty() {
return Err(Error::EmptyData("ANOVA requires at least one group".into()));
}
if groups.len() < 2 {
return Err(Error::InsufficientData("ANOVA requires at least two groups".into()));
}
let mut total_n = 0;
let mut global_sum = 0.0;
for (_, values) in groups.iter() {
if values.is_empty() {
return Err(Error::EmptyData("There is an empty group".into()));
}
total_n += values.len();
global_sum += values.iter().sum::<f64>();
}
let global_mean = global_sum / total_n as f64;
let mut ss_between = 0.0;
let mut ss_within = 0.0;
let mut ss_total = 0.0;
for (_, values) in groups.iter() {
let group_n = values.len();
let group_mean = values.iter().sum::<f64>() / group_n as f64;
ss_between += group_n as f64 * (group_mean - global_mean).powi(2);
for &value in *values {
ss_within += (value - group_mean).powi(2);
ss_total += (value - global_mean).powi(2);
}
}
let df_between = groups.len() - 1;
let df_within = total_n - groups.len();
let df_total = total_n - 1;
let ms_between = ss_between / df_between as f64;
let ms_within = ss_within / df_within as f64;
let f_statistic = ms_between / ms_within;
let p_value = 1.0 - f_distribution_cdf(f_statistic, df_between, df_within);
Ok(AnovaResult {
f_statistic,
p_value,
ss_between,
ss_within,
ss_total,
df_between,
df_within,
df_total,
ms_between,
ms_within,
significant: p_value < alpha,
})
}
/// CDF of the F distribution with (`df1`, `df2`) degrees of freedom,
/// via the regularized incomplete beta function:
/// `F(f) = I_x(df1/2, df2/2)` with `x = df1*f / (df1*f + df2)`.
///
/// Evaluated with the standard continued fraction (Numerical Recipes
/// `betai`/`betacf`), replacing the previous two/three-term polynomial
/// approximation that was badly inaccurate for moderate arguments.
fn f_distribution_cdf(f: f64, df1: usize, df2: usize) -> f64 {
    // Lanczos approximation of ln(Gamma(z)) (g = 7, n = 9); valid for
    // z >= 0.5, which holds for all half-integer df parameters used here.
    fn ln_gamma(z: f64) -> f64 {
        const COEF: [f64; 8] = [
            676.5203681218851,
            -1259.1392167224028,
            771.32342877765313,
            -176.61502916214059,
            12.507343278686905,
            -0.13857109526572012,
            9.9843695780195716e-6,
            1.5056327351493116e-7,
        ];
        let z = z - 1.0;
        let mut sum = 0.99999999999980993;
        for (i, &c) in COEF.iter().enumerate() {
            sum += c / (z + i as f64 + 1.0);
        }
        let t = z + 7.5;
        0.5 * (2.0 * std::f64::consts::PI).ln() + (z + 0.5) * t.ln() - t + sum.ln()
    }

    // Continued fraction for the incomplete beta function (modified Lentz).
    fn betacf(a: f64, b: f64, x: f64) -> f64 {
        let tiny = 1e-300;
        let qab = a + b;
        let qap = a + 1.0;
        let qam = a - 1.0;
        let mut c = 1.0;
        let mut d = 1.0 - qab * x / qap;
        if d.abs() < tiny {
            d = tiny;
        }
        d = 1.0 / d;
        let mut h = d;
        for m in 1..500 {
            let m = m as f64;
            let m2 = 2.0 * m;
            // Even step of the continued fraction.
            let aa = m * (b - m) * x / ((qam + m2) * (a + m2));
            d = 1.0 + aa * d;
            if d.abs() < tiny {
                d = tiny;
            }
            c = 1.0 + aa / c;
            if c.abs() < tiny {
                c = tiny;
            }
            d = 1.0 / d;
            h *= d * c;
            // Odd step.
            let aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
            d = 1.0 + aa * d;
            if d.abs() < tiny {
                d = tiny;
            }
            c = 1.0 + aa / c;
            if c.abs() < tiny {
                c = tiny;
            }
            d = 1.0 / d;
            let delta = d * c;
            h *= delta;
            if (delta - 1.0).abs() < 1e-15 {
                break;
            }
        }
        h
    }

    // Regularized incomplete beta I_x(a, b).
    fn inc_beta(a: f64, b: f64, x: f64) -> f64 {
        if x <= 0.0 {
            return 0.0;
        }
        if x >= 1.0 {
            return 1.0;
        }
        let ln_front = ln_gamma(a + b) - ln_gamma(a) - ln_gamma(b)
            + a * x.ln() + b * (1.0 - x).ln();
        // Use the symmetry I_x(a,b) = 1 - I_{1-x}(b,a) so the continued
        // fraction is always evaluated where it converges quickly.
        if x < (a + 1.0) / (a + b + 2.0) {
            ln_front.exp() * betacf(a, b, x) / a
        } else {
            1.0 - ln_front.exp() * betacf(b, a, 1.0 - x) / b
        }
    }

    if f.is_infinite() && f > 0.0 {
        return 1.0;
    }
    if f.is_nan() || f <= 0.0 {
        return 0.0;
    }
    let d1 = df1 as f64;
    let d2 = df2 as f64;
    let x = d1 * f / (d1 * f + d2);
    inc_beta(d1 / 2.0, d2 / 2.0, x).max(0.0).min(1.0)
}
/// Returns the most frequent value(s) in `data` — all tied categories —
/// sorted lexicographically so the result is deterministic.
///
/// The previous implementation returned ties in `HashMap` iteration order,
/// which is unspecified and varies between runs.
///
/// # Errors
/// Returns `EmptyData` if `data` is empty.
pub fn mode<T: AsRef<str> + Clone>(data: &[T]) -> Result<Vec<String>> {
    if data.is_empty() {
        return Err(Error::EmptyData("Mode calculation requires data".into()));
    }
    let mut counts: HashMap<String, usize> = HashMap::new();
    for item in data {
        *counts.entry(item.as_ref().to_string()).or_insert(0) += 1;
    }
    // `counts` is non-empty here, so a maximum always exists.
    let max_count = counts.values().copied().max().unwrap_or(0);
    let mut modes: Vec<String> = counts
        .into_iter()
        .filter(|&(_, count)| count == max_count)
        .map(|(category, _)| category)
        .collect();
    // Stable output order regardless of hash-map iteration order.
    modes.sort();
    Ok(modes)
}
/// Shannon entropy (in bits) of the empirical category distribution of `data`.
///
/// # Errors
/// Returns `EmptyData` if `data` is empty.
pub fn entropy<T: AsRef<str>>(data: &[T]) -> Result<f64> {
    if data.is_empty() {
        return Err(Error::EmptyData("Entropy calculation requires data".into()));
    }
    let n = data.len() as f64;
    // Tally each distinct category.
    let mut counts: HashMap<String, usize> = HashMap::new();
    for item in data {
        *counts.entry(item.as_ref().to_string()).or_insert(0) += 1;
    }
    // H = -sum p ln p, then convert from nats to bits.
    let nats: f64 = counts
        .values()
        .map(|&count| {
            let p = count as f64 / n;
            -p * p.ln()
        })
        .sum();
    Ok(nats / LN_2)
}
/// Maps each distinct category in `data` to its absolute count and its
/// relative frequency (count / total).
///
/// # Errors
/// Returns `EmptyData` if `data` is empty.
pub fn frequency_distribution<T: AsRef<str>>(
    data: &[T]
) -> Result<HashMap<String, (usize, f64)>> {
    if data.is_empty() {
        return Err(Error::EmptyData("Frequency calculation requires data".into()));
    }
    let n = data.len() as f64;
    // Tally each distinct category.
    let mut counts: HashMap<String, usize> = HashMap::new();
    for item in data {
        *counts.entry(item.as_ref().to_string()).or_insert(0) += 1;
    }
    // Attach the relative frequency to every count.
    Ok(counts
        .into_iter()
        .map(|(category, count)| (category, (count, count as f64 / n)))
        .collect())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dataframe::DataFrame;
    use crate::series::Series;

    // Totals and expected frequencies for a simple 2x3 table.
    #[test]
    fn test_contingency_table() {
        let observed = vec![
            vec![10.0, 20.0, 30.0],
            vec![40.0, 50.0, 60.0],
        ];
        let row_labels = vec!["Row1".to_string(), "Row2".to_string()];
        let col_labels = vec!["Col1".to_string(), "Col2".to_string(), "Col3".to_string()];
        let table = ContingencyTable::from_observed(observed, row_labels, col_labels).expect("operation should succeed");
        assert_eq!(table.row_totals, vec![60.0, 150.0]);
        assert_eq!(table.col_totals, vec![50.0, 70.0, 90.0]);
        assert_eq!(table.total, 210.0);
        let expected = table.expected_frequencies();
        assert_eq!(expected.len(), 2);
        assert_eq!(expected[0].len(), 3);
        // E[0][0] = 60 * 50 / 210 ~= 14.29.
        assert!((expected[0][0] - 14.29).abs() < 0.01);
    }

    // A strongly associated 2x2 table should come out significant with df = 1.
    #[test]
    fn test_chi_square_test() {
        let observed = vec![
            vec![40.0, 10.0],
            vec![10.0, 40.0],
        ];
        let row_labels = vec!["Row1".to_string(), "Row2".to_string()];
        let col_labels = vec!["Col1".to_string(), "Col2".to_string()];
        let table = ContingencyTable::from_observed(observed, row_labels, col_labels).expect("operation should succeed");
        let result = table.chi_square_test(0.05).expect("operation should succeed");
        assert!(result.chi2_statistic > 0.0);
        assert!(result.p_value < 0.05);
        assert!(result.significant);
        assert_eq!(result.df, 1);
    }

    // Cramer's V: perfect association gives V ~= 1, independence gives V ~= 0.
    #[test]
    fn test_cramers_v() {
        let observed = vec![
            vec![10.0, 0.0],
            vec![0.0, 10.0],
        ];
        let row_labels = vec!["Row1".to_string(), "Row2".to_string()];
        let col_labels = vec!["Col1".to_string(), "Col2".to_string()];
        let table = ContingencyTable::from_observed(observed, row_labels, col_labels).expect("operation should succeed");
        let v = table.cramers_v().expect("operation should succeed");
        assert!((v - 1.0).abs() < 0.01);
        let observed_no_assoc = vec![
            vec![10.0, 10.0],
            vec![10.0, 10.0],
        ];
        let table_no_assoc = ContingencyTable::from_observed(
            observed_no_assoc,
            row_labels.clone(),
            col_labels.clone()
        ).expect("operation should succeed");
        let v_no_assoc = table_no_assoc.cramers_v().expect("operation should succeed");
        assert!(v_no_assoc < 0.01);
    }

    // Perfectly dependent binary variables carry exactly 1 bit of mutual
    // information, and NMI normalizes that to 1.
    #[test]
    fn test_mutual_information() {
        let observed = vec![
            vec![10.0, 0.0],
            vec![0.0, 10.0],
        ];
        let row_labels = vec!["Row1".to_string(), "Row2".to_string()];
        let col_labels = vec!["Col1".to_string(), "Col2".to_string()];
        let table = ContingencyTable::from_observed(observed, row_labels, col_labels).expect("operation should succeed");
        let mi = table.mutual_information().expect("operation should succeed");
        assert!((mi - 1.0).abs() < 0.01);
        let nmi = table.normalized_mutual_information().expect("operation should succeed");
        assert!((nmi - 1.0).abs() < 0.01);
    }

    // Cross-tabulation of two string columns; labels are sorted, so row 0 is
    // "A" and column 0 is "X".
    #[test]
    fn test_dataframe_contingency_table() {
        let mut df = DataFrame::new();
        let cat1 = vec!["A", "A", "A", "B", "B", "B"]
            .into_iter().map(|s| s.to_string()).collect::<Vec<_>>();
        let cat2 = vec!["X", "X", "Y", "X", "Y", "Y"]
            .into_iter().map(|s| s.to_string()).collect::<Vec<_>>();
        df.add_column("cat1".to_string(),
            Series::new(cat1, Some("cat1".to_string())).expect("operation should succeed")).expect("operation should succeed");
        df.add_column("cat2".to_string(),
            Series::new(cat2, Some("cat2".to_string())).expect("operation should succeed")).expect("operation should succeed");
        let table = dataframe_contingency_table(&df, "cat1", "cat2").expect("operation should succeed");
        assert_eq!(table.observed.len(), 2);
        assert_eq!(table.observed[0].len(), 2);
        assert_eq!(table.observed[0][0], 2.0);
        assert_eq!(table.observed[0][1], 1.0);
        assert_eq!(table.observed[1][0], 1.0);
        assert_eq!(table.observed[1][1], 2.0);
    }

    // Three well-separated, tight groups: huge F, tiny p-value.
    #[test]
    fn test_categorical_anova() {
        let mut df = DataFrame::new();
        let cat = vec!["A", "A", "A", "B", "B", "B", "C", "C", "C"]
            .into_iter().map(|s| s.to_string()).collect::<Vec<_>>();
        let num = vec![1.0, 1.1, 0.9, 4.9, 5.1, 5.0, 9.8, 10.1, 10.1];
        df.add_column("category".to_string(),
            Series::new(cat, Some("category".to_string())).expect("operation should succeed")).expect("operation should succeed");
        df.add_column("value".to_string(),
            Series::new(num, Some("value".to_string())).expect("operation should succeed")).expect("operation should succeed");
        let result = dataframe_categorical_anova(&df, "category", "value", 0.05).expect("operation should succeed");
        assert!(result.f_statistic > 100.0);
        assert!(result.p_value < 0.001);
        assert!(result.significant);
    }

    // Single mode, then a two-way tie.
    #[test]
    fn test_mode() {
        let data = vec!["A", "B", "A", "C", "B", "A"];
        let result = mode(&data).expect("operation should succeed");
        assert_eq!(result, vec!["A"]);
        let data_tie = vec!["A", "B", "A", "B", "C", "C"];
        let result_tie = mode(&data_tie).expect("operation should succeed");
        assert_eq!(result_tie.len(), 2);
        assert!(result_tie.contains(&"A".to_string()));
        assert!(result_tie.contains(&"B".to_string()));
    }

    // Entropy in bits: uniform over 4 categories = 2 bits; constant = 0 bits.
    #[test]
    fn test_entropy() {
        let uniform = vec!["A", "B", "C", "D"];
        let entropy_uniform = entropy(&uniform).expect("operation should succeed");
        assert!((entropy_uniform - 2.0).abs() < 0.01);
        let single = vec!["A", "A", "A", "A"];
        let entropy_single = entropy(&single).expect("operation should succeed");
        assert!(entropy_single < 0.01);
        let mixed = vec!["A", "A", "B", "C", "C", "C", "C"];
        let entropy_mixed = entropy(&mixed).expect("operation should succeed");
        assert!(entropy_mixed > 0.0 && entropy_mixed < 2.0);
    }

    // Absolute counts and relative frequencies per category (n = 6).
    #[test]
    fn test_frequency_distribution() {
        let data = vec!["A", "B", "A", "C", "B", "A"];
        let freq = frequency_distribution(&data).expect("operation should succeed");
        assert_eq!(freq.len(), 3);
        let (a_count, a_freq) = freq.get("A").expect("operation should succeed");
        assert_eq!(*a_count, 3);
        assert!((a_freq - 0.5).abs() < 0.01);
        let (b_count, b_freq) = freq.get("B").expect("operation should succeed");
        assert_eq!(*b_count, 2);
        assert!((b_freq - 0.333).abs() < 0.01);
        let (c_count, c_freq) = freq.get("C").expect("operation should succeed");
        assert_eq!(*c_count, 1);
        assert!((c_freq - 0.167).abs() < 0.01);
    }
}