use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
use crate::dataframe::DataFrame;
use crate::error::{Error, PandRSError, Result};
use crate::series::{Categorical, CategoricalOrder, StringCategorical};
use crate::stats::ChiSquareResult;
/// A two-way table of observed frequencies together with its marginal totals.
///
/// `ContingencyTable::new` derives the totals from `observed`. Every field is public,
/// so code that edits one field afterwards is responsible for keeping the others in sync.
#[derive(Debug, Clone)]
pub struct ContingencyTable {
/// Observed frequency of each cell, indexed as `observed[row][col]`.
pub observed: Vec<Vec<f64>>,
/// One label per row of `observed`.
pub row_labels: Vec<String>,
/// One label per column of `observed`.
pub col_labels: Vec<String>,
/// Sum of each row of `observed`.
pub row_totals: Vec<f64>,
/// Sum of each column of `observed`.
pub col_totals: Vec<f64>,
/// Grand total, computed as the sum of `row_totals`.
pub total: f64,
}
impl ContingencyTable {
    /// Builds a contingency table from a matrix of observed frequencies.
    ///
    /// Row totals, column totals and the grand total are computed here. When a label
    /// list is omitted, labels of the form `Row_<i>` / `Col_<j>` are generated.
    ///
    /// # Errors
    ///
    /// * `Error::InsufficientData` if `observed` has no rows or no columns.
    /// * `Error::Consistency` if the rows differ in length, or if a supplied label list
    ///   does not match the corresponding dimension.
    pub fn new(
        observed: Vec<Vec<f64>>,
        row_labels: Option<Vec<String>>,
        col_labels: Option<Vec<String>>,
    ) -> Result<Self> {
        let rows = observed.len();
        if rows == 0 {
            return Err(Error::InsufficientData(
                "Observed frequencies cannot be empty".into(),
            ));
        }

        let cols = observed[0].len();
        if observed.iter().any(|row| row.len() != cols) {
            return Err(Error::Consistency(
                "All rows must have the same number of columns".into(),
            ));
        }

        // A table whose rows are all empty has no cells either. Rejecting it here keeps
        // the degrees-of-freedom arithmetic in `chi_square` / `cramers_v` well defined.
        if cols == 0 {
            return Err(Error::InsufficientData(
                "Observed frequencies cannot be empty".into(),
            ));
        }

        let row_labels = match row_labels {
            Some(labels) if labels.len() != rows => {
                return Err(Error::Consistency(format!(
                    "Number of row labels ({}) does not match number of rows ({})",
                    labels.len(),
                    rows
                )));
            }
            Some(labels) => labels,
            None => (0..rows).map(|i| format!("Row_{}", i)).collect(),
        };

        let col_labels = match col_labels {
            Some(labels) if labels.len() != cols => {
                return Err(Error::Consistency(format!(
                    "Number of column labels ({}) does not match number of columns ({})",
                    labels.len(),
                    cols
                )));
            }
            Some(labels) => labels,
            None => (0..cols).map(|j| format!("Col_{}", j)).collect(),
        };

        let row_totals: Vec<f64> = observed
            .iter()
            .map(|row| row.iter().sum::<f64>())
            .collect();
        let col_totals: Vec<f64> = (0..cols)
            .map(|j| observed.iter().map(|row| row[j]).sum::<f64>())
            .collect();
        let total = row_totals.iter().sum();

        Ok(ContingencyTable {
            observed,
            row_labels,
            col_labels,
            row_totals,
            col_totals,
            total,
        })
    }

    /// Returns the expected frequency of every cell under independence:
    /// `row_total * col_total / total`.
    ///
    /// The result has one row per entry of `row_totals` and one column per entry of
    /// `col_totals`. When the grand total is zero the formula would divide by zero, so
    /// every expected frequency is reported as `0.0` instead of NaN.
    pub fn expected_frequencies(&self) -> Vec<Vec<f64>> {
        if self.total == 0.0 {
            return vec![vec![0.0; self.col_totals.len()]; self.row_totals.len()];
        }

        self.row_totals
            .iter()
            .map(|&row_total| {
                self.col_totals
                    .iter()
                    .map(|&col_total| row_total * col_total / self.total)
                    .collect()
            })
            .collect()
    }

    /// Computes the Pearson chi-square statistic of the table and hands it, together
    /// with the degrees of freedom `(rows - 1) * (cols - 1)`, `alpha` and the expected
    /// frequencies, to `crate::stats::inference::chi_square_test_with_statistic`.
    ///
    /// Cells whose expected frequency is zero contribute nothing to the statistic.
    ///
    /// # Errors
    ///
    /// * `Error::InsufficientData` if the grand total is not positive (nothing was
    ///   observed, so the statistic is undefined).
    /// * Whatever `chi_square_test_with_statistic` reports.
    pub fn chi_square(&self, alpha: f64) -> Result<ChiSquareResult> {
        if self.total <= 0.0 || self.total.is_nan() {
            return Err(Error::InsufficientData(
                "Total observed frequency must be positive".into(),
            ));
        }

        let expected = self.expected_frequencies();

        let mut chi2 = 0.0_f64;
        for (observed_row, expected_row) in self.observed.iter().zip(&expected) {
            for (&o, &e) in observed_row.iter().zip(expected_row) {
                if e == 0.0 {
                    continue;
                }
                chi2 += (o - e).powi(2) / e;
            }
        }

        // Saturating subtraction: the fields are public, so a hand-built table could be
        // empty, and plain `len() - 1` would underflow.
        let rows = self.observed.len();
        let cols = self.observed.first().map_or(0, Vec::len);
        let df = rows.saturating_sub(1) * cols.saturating_sub(1);

        crate::stats::inference::chi_square_test_with_statistic(chi2, df, alpha, expected)
    }

    /// Computes Cramér's V, `sqrt(chi2 / (n * min(rows - 1, cols - 1)))`.
    ///
    /// # Errors
    ///
    /// * `Error::Consistency` if the table has fewer than two rows or two columns.
    /// * Any error returned by [`ContingencyTable::chi_square`].
    pub fn cramers_v(&self) -> Result<f64> {
        let rows = self.observed.len();
        let cols = self.observed.first().map_or(0, Vec::len);

        // Validate the shape first so a degenerate table is reported as such instead
        // of running a chi-square test that cannot be used anyway.
        let min_dim = rows.min(cols).saturating_sub(1);
        if min_dim == 0 {
            return Err(Error::Consistency(
                "Cannot calculate Cramer's V with only one row or column".into(),
            ));
        }

        // The significance level does not influence the statistic itself.
        let chi2_result = self.chi_square(0.05)?;
        let n = self.total;

        Ok((chi2_result.chi2_statistic / (n * min_dim as f64)).sqrt())
    }
}
/// Cross-tabulates two categorical variables of equal length.
///
/// Rows follow the categories of `cat1`, columns the categories of `cat2`. A position
/// is skipped when either variable has the code `-1` there (presumably the
/// missing-value marker of `Categorical`; confirm against its definition).
///
/// # Errors
///
/// * `Error::Consistency` if the two variables differ in length.
/// * Any error returned by `ContingencyTable::new`.
pub fn contingency_table<T, U>(
    cat1: &Categorical<T>,
    cat2: &Categorical<U>,
) -> Result<ContingencyTable>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
    U: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let (len1, len2) = (cat1.len(), cat2.len());
    if len1 != len2 {
        return Err(Error::Consistency(format!(
            "Categorical variables must have the same length, got {} and {}",
            len1, len2
        )));
    }

    let row_categories = cat1.categories();
    let col_categories = cat2.categories();

    let mut counts = vec![vec![0.0; col_categories.len()]; row_categories.len()];
    for (&row_code, &col_code) in cat1.codes().iter().zip(cat2.codes().iter()) {
        if row_code != -1 && col_code != -1 {
            counts[row_code as usize][col_code as usize] += 1.0;
        }
    }

    let row_labels: Vec<String> = row_categories.iter().map(|c| c.to_string()).collect();
    let col_labels: Vec<String> = col_categories.iter().map(|c| c.to_string()).collect();

    ContingencyTable::new(counts, Some(row_labels), Some(col_labels))
}
pub fn dataframe_contingency_table(
df: &DataFrame,
col1: &str,
col2: &str,
) -> Result<ContingencyTable> {
if !df.contains_column(col1) {
return Err(Error::Column(format!("Column '{}' does not exist", col1)));
}
if !df.contains_column(col2) {
return Err(Error::Column(format!("Column '{}' does not exist", col2)));
}
if !df.is_categorical(col1) {
return Err(Error::Consistency(format!(
"Column '{}' is not categorical",
col1
)));
}
if !df.is_categorical(col2) {
return Err(Error::Consistency(format!(
"Column '{}' is not categorical",
col2
)));
}
let cat1 = df.get_categorical::<String>(col1)?;
let cat2 = df.get_categorical::<String>(col2)?;
contingency_table(&cat1, &cat2)
}
/// Runs a chi-square test on the contingency table of two categorical variables.
///
/// # Errors
///
/// Propagates any error from [`contingency_table`] or `ContingencyTable::chi_square`.
pub fn chi_square_test_for_independence<T, U>(
    cat1: &Categorical<T>,
    cat2: &Categorical<U>,
    alpha: f64,
) -> Result<ChiSquareResult>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
    U: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    contingency_table(cat1, cat2)?.chi_square(alpha)
}
/// Runs a chi-square test on the contingency table of two `DataFrame` columns.
///
/// # Errors
///
/// Propagates any error from [`dataframe_contingency_table`] or
/// `ContingencyTable::chi_square`.
pub fn dataframe_chi_square_test(
    df: &DataFrame,
    col1: &str,
    col2: &str,
    alpha: f64,
) -> Result<ChiSquareResult> {
    dataframe_contingency_table(df, col1, col2)?.chi_square(alpha)
}
/// Computes Cramér's V for two categorical variables via their contingency table.
///
/// # Errors
///
/// Propagates any error from [`contingency_table`] or `ContingencyTable::cramers_v`.
pub fn cramers_v<T, U>(cat1: &Categorical<T>, cat2: &Categorical<U>) -> Result<f64>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
    U: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    contingency_table(cat1, cat2)?.cramers_v()
}
/// Computes Cramér's V for two `DataFrame` columns via their contingency table.
///
/// # Errors
///
/// Propagates any error from [`dataframe_contingency_table`] or
/// `ContingencyTable::cramers_v`.
pub fn dataframe_cramers_v(df: &DataFrame, col1: &str, col2: &str) -> Result<f64> {
    dataframe_contingency_table(df, col1, col2)?.cramers_v()
}
/// Runs `crate::stats::anova` on `numeric_values` grouped by the category of `cat` at
/// the same position.
///
/// Positions whose code is `-1` are left out (presumably the missing-value marker of
/// `Categorical`; confirm against its definition). Groups are keyed by the `Display`
/// text of their category.
///
/// # Errors
///
/// * `Error::Consistency` if `cat` and `numeric_values` differ in length.
/// * Any error returned by `crate::stats::anova`.
pub fn categorical_anova<T>(
    cat: &Categorical<T>,
    numeric_values: &[f64],
    alpha: f64,
) -> Result<crate::stats::AnovaResult>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    if cat.len() != numeric_values.len() {
        return Err(Error::Consistency(format!(
            "Categorical and numeric variables must have the same length, got {} and {}",
            cat.len(),
            numeric_values.len()
        )));
    }

    let categories = cat.categories();
    let mut groups: HashMap<&T, Vec<f64>> = HashMap::new();
    for (&code, &value) in cat.codes().iter().zip(numeric_values) {
        if code == -1 {
            continue;
        }
        groups
            .entry(&categories[code as usize])
            .or_default()
            .push(value);
    }

    // `anova` wants `&str` keys, so the label strings need an owner that outlives the
    // map. Splitting labels from samples lets the samples be moved rather than cloned.
    let (labels, samples): (Vec<String>, Vec<Vec<f64>>) = groups
        .into_iter()
        .map(|(category, values)| (category.to_string(), values))
        .unzip();

    let anova_groups: HashMap<&str, Vec<f64>> =
        labels.iter().map(String::as_str).zip(samples).collect();

    crate::stats::anova(&anova_groups, alpha)
}
/// Runs [`categorical_anova`] on a categorical column and a numeric column of `df`.
///
/// # Errors
///
/// * `Error::Column` if either column does not exist.
/// * `Error::Consistency` if `cat_col` is not categorical.
/// * `Error::InvalidValue` if `numeric_col` is not numeric.
/// * Any error from extracting the columns or from [`categorical_anova`].
pub fn dataframe_categorical_anova(
    df: &DataFrame,
    cat_col: &str,
    numeric_col: &str,
    alpha: f64,
) -> Result<crate::stats::AnovaResult> {
    if !df.contains_column(cat_col) {
        return Err(Error::Column(format!(
            "Column '{}' does not exist",
            cat_col
        )));
    }
    if !df.contains_column(numeric_col) {
        return Err(Error::Column(format!(
            "Column '{}' does not exist",
            numeric_col
        )));
    }
    if !df.is_categorical(cat_col) {
        return Err(Error::Consistency(format!(
            "Column '{}' is not categorical",
            cat_col
        )));
    }

    let cat = df.get_categorical::<String>(cat_col)?;

    // Check the column type before extracting values, so that a non-numeric column
    // yields this dedicated error rather than whatever the extraction would report.
    if !df.is_numeric_column(numeric_col) {
        return Err(Error::InvalidValue(format!(
            "Column '{}' is not numeric",
            numeric_col
        )));
    }
    let numeric_values = df.get_column_numeric_values(numeric_col)?;

    categorical_anova(&cat, &numeric_values, alpha)
}
/// Returns the most frequent category of `cat`, or `None` when no position carries a
/// category (every code is `-1`, or there are no codes at all).
///
/// When several categories share the highest count, the one that comes first in
/// `cat.categories()` is returned.
pub fn mode<T>(cat: &Categorical<T>) -> Result<Option<T>>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let mut tallies = vec![0; cat.categories().len()];
    for &code in cat.codes() {
        if code != -1 {
            tallies[code as usize] += 1;
        }
    }

    // `max_by_key` keeps the last of several equal maxima, so the scan runs in reverse
    // to make the lowest index win a tie.
    let winner = tallies
        .iter()
        .enumerate()
        .filter(|&(_, &n)| n > 0)
        .rev()
        .max_by_key(|&(_, &n)| n)
        .map(|(index, _)| index);

    Ok(winner.map(|index| cat.categories()[index].clone()))
}
/// Computes the Shannon entropy, in bits, of the category distribution of `cat`.
///
/// Positions whose code is `-1` are excluded from the distribution. When no position
/// carries a category, the entropy is reported as `0.0`.
pub fn entropy<T>(cat: &Categorical<T>) -> Result<f64>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let mut counts = vec![0usize; cat.categories().len()];
    let mut valid_count = 0usize;
    for &code in cat.codes() {
        if code != -1 {
            counts[code as usize] += 1;
            valid_count += 1;
        }
    }

    if valid_count == 0 {
        return Ok(0.0);
    }

    let valid_count = valid_count as f64;
    let mut entropy = 0.0;
    // Empty categories are skipped: their term would be `0 * log2(0)`, which is NaN.
    for &count in counts.iter().filter(|&&count| count > 0) {
        let p = count as f64 / valid_count;
        entropy -= p * p.log2();
    }

    Ok(entropy)
}
/// Computes the mutual information, in bits, between two categorical variables from
/// their contingency table.
///
/// Cells with a joint probability that is not strictly positive are skipped.
///
/// # Errors
///
/// Propagates any error from [`contingency_table`].
pub fn mutual_information<T, U>(cat1: &Categorical<T>, cat2: &Categorical<U>) -> Result<f64>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
    U: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let table = contingency_table(cat1, cat2)?;
    let total = table.total;

    let mut information = 0.0;
    for (row, &row_total) in table.observed.iter().zip(&table.row_totals) {
        let p_x = row_total / total;
        for (&cell, &col_total) in row.iter().zip(&table.col_totals) {
            let p_xy = cell / total;
            if p_xy > 0.0 {
                let p_y = col_total / total;
                information += p_xy * (p_xy / (p_x * p_y)).log2();
            }
        }
    }

    Ok(information)
}
/// Computes the mutual information of two categorical variables divided by the
/// geometric mean of their entropies, `mi / sqrt(h1 * h2)`.
///
/// Returns `0.0` when either entropy is zero.
///
/// # Errors
///
/// Propagates any error from [`mutual_information`] or [`entropy`].
pub fn normalized_mutual_information<T, U>(
    cat1: &Categorical<T>,
    cat2: &Categorical<U>,
) -> Result<f64>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
    U: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let mi = mutual_information(cat1, cat2)?;
    let h1 = entropy(cat1)?;
    let h2 = entropy(cat2)?;

    let normalized = if h1 != 0.0 && h2 != 0.0 {
        mi / (h1 * h2).sqrt()
    } else {
        0.0
    };

    Ok(normalized)
}
/// Computes the normalized mutual information between two categorical columns of `df`.
///
/// # Errors
///
/// Propagates any error from `DataFrame::get_categorical` or
/// [`normalized_mutual_information`].
pub fn dataframe_normalized_mutual_information(
    df: &DataFrame,
    col1: &str,
    col2: &str,
) -> Result<f64> {
    let first = df.get_categorical::<String>(col1)?;
    let second = df.get_categorical::<String>(col2)?;

    normalized_mutual_information(&first, &second)
}
/// Counts how often each category occurs in `cat`.
///
/// Positions whose code is `-1` are not counted, and categories that never occur do
/// not appear in the returned map.
pub fn frequency_distribution<T>(cat: &Categorical<T>) -> Result<HashMap<T, usize>>
where
    T: Debug + Clone + Eq + Hash + std::fmt::Display + Ord,
{
    let categories = cat.categories();
    let mut tally: HashMap<T, usize> = HashMap::new();

    for &code in cat.codes().iter().filter(|&&code| code != -1) {
        let occurrences = tally.entry(categories[code as usize].clone()).or_insert(0);
        *occurrences += 1;
    }

    Ok(tally)
}