use crate::error::InsightError;
/// Checks that a column-oriented data set is safe to pass to the numeric routines.
///
/// Columns are inspected in order and the first problem found is returned.
/// The label used in error values is `names[i]`, or `col_{i}` when `names`
/// has no entry at that index.
///
/// # Errors
/// - `DegenerateData` when `columns` is empty, or a column contains ±infinity.
/// - `InsufficientData` when the first column has fewer than 2 rows.
/// - `DimensionMismatch` when a column's length differs from the first column's.
/// - `MissingValues` when a column contains NaN (checked before infinities).
pub fn validate_clean_data(columns: &[Vec<f64>], names: &[String]) -> Result<(), InsightError> {
    let expected_len = match columns.first() {
        Some(first) => first.len(),
        None => {
            return Err(InsightError::DegenerateData {
                reason: "no columns provided".into(),
            })
        }
    };
    if expected_len < 2 {
        return Err(InsightError::InsufficientData {
            min_required: 2,
            actual: expected_len,
        });
    }

    for (i, column) in columns.iter().enumerate() {
        let actual_len = column.len();
        if actual_len != expected_len {
            return Err(InsightError::DimensionMismatch {
                expected: expected_len,
                actual: actual_len,
            });
        }

        // Only materialised when an error is actually reported.
        let label = || names.get(i).cloned().unwrap_or_else(|| format!("col_{i}"));

        // One pass over the column; NaN and infinity are mutually exclusive.
        let mut nan_count = 0usize;
        let mut inf_count = 0usize;
        for value in column {
            if value.is_nan() {
                nan_count += 1;
            } else if value.is_infinite() {
                inf_count += 1;
            }
        }

        if nan_count != 0 {
            return Err(InsightError::MissingValues {
                column: label(),
                count: nan_count,
            });
        }
        if inf_count != 0 {
            let name = label();
            return Err(InsightError::DegenerateData {
                reason: format!(
                    "column '{name}' contains {inf_count} non-finite (infinite) value(s)"
                ),
            });
        }
    }
    Ok(())
}
/// Correlation coefficient that `correlation_analysis` should compute.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CorrelationMethod {
/// Pearson correlation (the default method of `CorrelationConfig`).
Pearson,
/// Spearman correlation.
Spearman,
/// Kendall correlation (pairwise p-values use the tau-b routine).
Kendall,
}
/// Settings for `correlation_analysis`.
#[derive(Debug, Clone)]
pub struct CorrelationConfig {
/// Which correlation coefficient to compute.
pub method: CorrelationMethod,
/// Pairs whose absolute coefficient is strictly greater than this value are
/// reported as "high" pairs.
pub high_threshold: f64,
}
impl Default for CorrelationConfig {
fn default() -> Self {
Self {
method: CorrelationMethod::Pearson,
high_threshold: 0.7,
}
}
}
/// One pair of columns whose correlation exceeded the configured threshold.
#[derive(Debug, Clone)]
pub struct CorrelationPair {
/// Name of the column with the lower index in the pair.
pub col_a: String,
/// Name of the column with the higher index in the pair.
pub col_b: String,
/// Correlation coefficient read from the correlation matrix for this pair.
pub r: f64,
/// p-value from the pairwise computation; NaN when that computation
/// returned no result.
pub p_value: f64,
}
/// Output of `correlation_analysis`.
#[derive(Debug, Clone)]
pub struct CorrelationAnalysis {
/// Method that was used (copied from the config).
pub method: CorrelationMethod,
/// Correlation matrix over the input columns, as returned by the
/// underlying `u_analytics` routine.
pub matrix: u_numflow::matrix::Matrix,
/// Pairs whose absolute correlation exceeded the threshold, sorted by
/// descending absolute correlation.
pub high_pairs: Vec<CorrelationPair>,
/// Copy of the column names passed by the caller.
pub names: Vec<String>,
}
/// Computes the correlation matrix of `columns` and lists strongly correlated pairs.
///
/// The input is first checked with `validate_clean_data`. The matrix is computed
/// with the routine matching `config.method`. Every pair `(i, j)` with `i < j`
/// whose absolute coefficient is strictly greater than `config.high_threshold`
/// is reported together with the p-value of the matching pairwise routine
/// (NaN when that routine yields no result). Pairs are sorted by descending
/// absolute coefficient.
///
/// Pair labels come from `names`; when `names` has no entry for a column the
/// label falls back to `col_{index}`, the same fallback `validate_clean_data`
/// uses, instead of panicking on an out-of-range index.
///
/// # Errors
/// - Any error from `validate_clean_data`.
/// - `DegenerateData` when the matrix routine returns `None`.
pub fn correlation_analysis(
    columns: &[Vec<f64>],
    names: &[String],
    config: &CorrelationConfig,
) -> Result<CorrelationAnalysis, InsightError> {
    validate_clean_data(columns, names)?;
    let refs: Vec<&[f64]> = columns.iter().map(|c| c.as_slice()).collect();
    let matrix = match config.method {
        CorrelationMethod::Pearson => u_analytics::correlation::correlation_matrix(&refs),
        CorrelationMethod::Spearman => u_analytics::correlation::spearman_matrix(&refs),
        CorrelationMethod::Kendall => u_analytics::correlation::kendall_matrix(&refs),
    }
    .ok_or_else(|| InsightError::DegenerateData {
        reason: "correlation matrix computation failed (possible constant columns or zero-variance data)".into(),
    })?;

    // `names` may legitimately be shorter than `columns` (validation accepts
    // that), so never index it directly.
    let label = |idx: usize| {
        names
            .get(idx)
            .cloned()
            .unwrap_or_else(|| format!("col_{idx}"))
    };

    let n = columns.len();
    let mut high_pairs = Vec::new();
    for i in 0..n {
        for j in (i + 1)..n {
            let r = matrix.get(i, j);
            // A NaN coefficient fails this comparison and is therefore skipped.
            if r.abs() > config.high_threshold {
                let pair_result = match config.method {
                    CorrelationMethod::Pearson => {
                        u_analytics::correlation::pearson(refs[i], refs[j])
                    }
                    CorrelationMethod::Spearman => {
                        u_analytics::correlation::spearman(refs[i], refs[j])
                    }
                    CorrelationMethod::Kendall => {
                        u_analytics::correlation::kendall_tau_b(refs[i], refs[j])
                    }
                };
                let p_value = pair_result.map_or(f64::NAN, |pr| pr.p_value);
                high_pairs.push(CorrelationPair {
                    col_a: label(i),
                    col_b: label(j),
                    r,
                    p_value,
                });
            }
        }
    }

    // Strongest relationships first.
    high_pairs.sort_by(|a, b| {
        b.r.abs()
            .partial_cmp(&a.r.abs())
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    Ok(CorrelationAnalysis {
        method: config.method,
        matrix,
        high_pairs,
        names: names.to_vec(),
    })
}
/// Output of `regression_analysis`.
#[derive(Debug, Clone)]
pub struct RegressionAnalysis {
/// Name of the response variable, as passed by the caller.
pub target_name: String,
/// Names of the predictors, as passed by the caller.
pub predictor_names: Vec<String>,
/// Coefficient of determination reported by the regression routine.
pub r_squared: f64,
/// Adjusted R² reported by the regression routine.
pub adj_r_squared: f64,
/// Fitted coefficients. For a single predictor this is `[intercept, slope]`;
/// for several predictors it is the coefficient vector returned by the
/// multiple-regression routine.
pub coefficients: Vec<f64>,
/// p-values aligned with `coefficients`
/// (`[intercept_p, slope_p]` for a single predictor).
pub p_values: Vec<f64>,
/// Variance inflation factors from the multiple-regression routine; empty
/// for a single predictor.
pub vif: Vec<f64>,
/// p-value of the overall F-test reported by the regression routine.
pub f_p_value: f64,
}
/// Fits a linear regression of `target` on one or more predictor columns.
///
/// With exactly one predictor the simple-regression routine is used:
/// coefficients and p-values are reported as `[intercept, slope]` and `vif`
/// is left empty. With several predictors the coefficients, p-values and VIFs
/// of the multiple-regression routine are passed through.
///
/// # Errors
/// - `InvalidParameter` when `predictors` is empty.
/// - `InsufficientData` when `target` has fewer than 3 observations.
/// - `MissingValues` when `target` contains NaN.
/// - Any error from `validate_clean_data`, run over the target followed by
///   the predictors.
/// - `ComputationFailed` when the regression routine returns `None`.
pub fn regression_analysis(
    predictors: &[Vec<f64>],
    predictor_names: &[String],
    target: &[f64],
    target_name: &str,
) -> Result<RegressionAnalysis, InsightError> {
    if predictors.is_empty() {
        return Err(InsightError::InvalidParameter {
            name: "predictors".into(),
            message: "at least 1 predictor required".into(),
        });
    }

    let n_obs = target.len();
    if n_obs < 3 {
        return Err(InsightError::InsufficientData {
            min_required: 3,
            actual: n_obs,
        });
    }

    let target_nans = target.iter().filter(|v| v.is_nan()).count();
    if target_nans > 0 {
        return Err(InsightError::MissingValues {
            column: target_name.to_string(),
            count: target_nans,
        });
    }

    // Target goes first so that length mismatches are reported relative to it.
    let stacked_names: Vec<String> = std::iter::once(target_name.to_string())
        .chain(predictor_names.iter().cloned())
        .collect();
    let stacked_cols: Vec<Vec<f64>> = std::iter::once(target.to_vec())
        .chain(predictors.iter().cloned())
        .collect();
    validate_clean_data(&stacked_cols, &stacked_names)?;

    let (r_squared, adj_r_squared, coefficients, p_values, vif, f_p_value) = match predictors {
        [only] => {
            let fit = u_analytics::regression::simple_linear_regression(only, target)
                .ok_or_else(|| InsightError::ComputationFailed {
                    operation: "simple linear regression".into(),
                    detail: "computation returned None (possible degenerate data)".into(),
                })?;
            (
                fit.r_squared,
                fit.adjusted_r_squared,
                vec![fit.intercept, fit.slope],
                vec![fit.intercept_p, fit.slope_p],
                Vec::new(),
                fit.f_p_value,
            )
        }
        _ => {
            let x_refs: Vec<&[f64]> = predictors.iter().map(|c| c.as_slice()).collect();
            let fit = u_analytics::regression::multiple_linear_regression(&x_refs, target)
                .ok_or_else(|| InsightError::ComputationFailed {
                    operation: "multiple linear regression".into(),
                    detail: "computation returned None (possible rank deficiency or singular data)"
                        .into(),
                })?;
            (
                fit.r_squared,
                fit.adjusted_r_squared,
                fit.coefficients.clone(),
                fit.p_values.clone(),
                fit.vif.clone(),
                fit.f_p_value,
            )
        }
    };

    Ok(RegressionAnalysis {
        target_name: target_name.to_string(),
        predictor_names: predictor_names.to_vec(),
        r_squared,
        adj_r_squared,
        coefficients,
        p_values,
        vif,
        f_p_value,
    })
}
/// Output of `vif_analysis`.
#[derive(Debug, Clone)]
pub struct VifAnalysis {
/// Variance inflation factor of each input column, in input order.
pub vif_per_column: Vec<f64>,
/// Indices of the columns whose VIF is strictly greater than `threshold`.
pub high_vif_columns: Vec<u32>,
/// Threshold that was applied (copied from the caller's argument).
pub threshold: f64,
/// Copy of the column names passed by the caller.
pub names: Vec<String>,
}
/// Computes the variance inflation factor of every column and flags those
/// strictly above `threshold`.
///
/// # Errors
/// - Any error from `validate_clean_data`.
/// - `DegenerateData` when the VIF routine returns `None`.
pub fn vif_analysis(
    columns: &[Vec<f64>],
    names: &[String],
    threshold: f64,
) -> Result<VifAnalysis, InsightError> {
    validate_clean_data(columns, names)?;

    let column_slices: Vec<&[f64]> = columns.iter().map(Vec::as_slice).collect();
    let vif_per_column = match u_analytics::regression::vif(&column_slices) {
        Some(values) => values,
        None => {
            return Err(InsightError::DegenerateData {
                reason:
                    "VIF computation failed (insufficient data, length mismatch, or near-zero variance)"
                        .into(),
            })
        }
    };

    // Collect the positions of the columns that exceed the threshold.
    let mut high_vif_columns: Vec<u32> = Vec::new();
    for (idx, &value) in vif_per_column.iter().enumerate() {
        if value > threshold {
            high_vif_columns.push(idx as u32);
        }
    }

    Ok(VifAnalysis {
        vif_per_column,
        high_vif_columns,
        threshold,
        names: names.to_vec(),
    })
}
/// Returns the condition number reported by the `u_analytics` regression
/// module for the given columns.
///
/// # Errors
/// - Any error from `validate_clean_data`.
/// - `DegenerateData` when the underlying routine returns `None`.
pub fn condition_number(columns: &[Vec<f64>], names: &[String]) -> Result<f64, InsightError> {
    validate_clean_data(columns, names)?;

    let column_slices: Vec<&[f64]> = columns.iter().map(Vec::as_slice).collect();
    match u_analytics::regression::condition_number(&column_slices) {
        Some(kappa) => Ok(kappa),
        None => Err(InsightError::DegenerateData {
            reason:
                "condition number computation failed (insufficient data or eigendecomposition failed)"
                    .into(),
        }),
    }
}
/// Output of `cramers_v`.
#[derive(Debug, Clone)]
pub struct CramersVResult {
/// Cramér's V: `sqrt(chi_squared / (n * (min(n_rows, n_cols) - 1)))`, where
/// `n` is the sum of all table cells.
pub v: f64,
/// Test statistic of the chi-squared independence test.
pub chi_squared: f64,
/// p-value of the chi-squared independence test.
pub p_value: f64,
/// Number of table rows, as passed by the caller.
pub n_rows: usize,
/// Number of table columns, as passed by the caller.
pub n_cols: usize,
}
/// Computes Cramér's V for a contingency table given as a flat slice of cell
/// counts together with its dimensions.
///
/// Returns `None` when the chi-squared independence test yields no result,
/// when the cell counts do not sum to a positive value, or when the smaller
/// table dimension is below 2.
pub fn cramers_v(table: &[f64], n_rows: usize, n_cols: usize) -> Option<CramersVResult> {
    let chi2 = u_analytics::testing::chi_squared_independence(table, n_rows, n_cols)?;

    let total: f64 = table.iter().sum();
    let min_dim = n_rows.min(n_cols);
    if total <= 0.0 || min_dim < 2 {
        return None;
    }

    // V = sqrt(chi² / (N * (min(r, c) - 1))); a non-positive (or NaN)
    // denominator yields 0 instead of a division.
    let denom = total * (min_dim - 1) as f64;
    let v = if denom > 0.0 {
        (chi2.statistic / denom).sqrt()
    } else {
        0.0
    };

    Some(CramersVResult {
        v,
        chi_squared: chi2.statistic,
        p_value: chi2.p_value,
        n_rows,
        n_cols,
    })
}
/// One-way ANOVA outcome for a single feature.
#[derive(Debug, Clone)]
pub struct AnovaFeatureResult {
/// Feature name (`feature_{i}` when the caller supplied no name for it).
pub name: String,
/// F statistic; 0.0 when the ANOVA routine returned no result.
pub f_statistic: f64,
/// p-value; 1.0 when the ANOVA routine returned no result.
pub p_value: f64,
}
/// Output of `anova_feature_selection`.
#[derive(Debug, Clone)]
pub struct AnovaSelectionResult {
/// Per-feature results, sorted by ascending p-value.
pub features: Vec<AnovaFeatureResult>,
/// Original indices (into the input `features` slice) of the features whose
/// p-value is at most the significance level, in the same sorted order.
pub selected_indices: Vec<usize>,
}
/// Ranks features by one-way ANOVA against a class label and selects the
/// significant ones.
///
/// For every feature the observations are grouped by class and passed to the
/// one-way ANOVA routine. When that routine returns `None`, the feature is
/// recorded with `f_statistic = 0.0` and `p_value = 1.0`. Results are sorted
/// by ascending p-value; NaN p-values are placed last so the ordering is
/// always well defined. `selected_indices` holds the original indices of the
/// features with `p_value <= significance_level`, in that sorted order.
///
/// Missing entries in `feature_names` fall back to `feature_{i}`.
///
/// # Errors
/// - `InvalidParameter` when `features` is empty.
/// - `InsufficientData` when the first feature has fewer than 4 rows.
/// - `DimensionMismatch` when `target` or any feature differs in length from
///   the first feature.
/// - `DegenerateData` when `target` contains fewer than 2 distinct classes.
/// - `MissingValues` when a feature contains NaN.
pub fn anova_feature_selection(
    features: &[Vec<f64>],
    feature_names: &[String],
    target: &[usize],
    significance_level: f64,
) -> Result<AnovaSelectionResult, InsightError> {
    if features.is_empty() {
        return Err(InsightError::InvalidParameter {
            name: "features".into(),
            message: "at least 1 feature required".into(),
        });
    }
    let n = features[0].len();
    if n < 4 {
        return Err(InsightError::InsufficientData {
            min_required: 4,
            actual: n,
        });
    }
    if target.len() != n {
        return Err(InsightError::DimensionMismatch {
            expected: n,
            actual: target.len(),
        });
    }

    let mut classes: Vec<usize> = target.to_vec();
    classes.sort_unstable();
    classes.dedup();
    if classes.len() < 2 {
        return Err(InsightError::DegenerateData {
            reason: format!("ANOVA requires at least 2 classes, got {}", classes.len()),
        });
    }

    // `classes` is sorted and deduplicated, so the insertion point of a label
    // is its dense class index. Computed once and reused for every feature.
    let class_of_row: Vec<usize> = target
        .iter()
        .map(|t| classes.partition_point(|c| c < t))
        .collect();

    let mut results: Vec<(usize, AnovaFeatureResult)> = Vec::with_capacity(features.len());
    for (i, feature) in features.iter().enumerate() {
        if feature.len() != n {
            return Err(InsightError::DimensionMismatch {
                expected: n,
                actual: feature.len(),
            });
        }
        let name = feature_names
            .get(i)
            .cloned()
            .unwrap_or_else(|| format!("feature_{i}"));
        let nan_count = feature.iter().filter(|v| v.is_nan()).count();
        if nan_count > 0 {
            return Err(InsightError::MissingValues {
                column: name,
                count: nan_count,
            });
        }

        // Single pass: route every observation to its class bucket
        // (row order inside each bucket is preserved).
        let mut groups: Vec<Vec<f64>> = vec![Vec::new(); classes.len()];
        for (&value, &class_idx) in feature.iter().zip(&class_of_row) {
            groups[class_idx].push(value);
        }
        let group_refs: Vec<&[f64]> = groups.iter().map(|g| g.as_slice()).collect();

        // A feature the ANOVA routine cannot evaluate is kept, but marked as
        // carrying no evidence (F = 0, p = 1).
        let (f_statistic, p_value) = u_analytics::testing::one_way_anova(&group_refs)
            .map_or((0.0, 1.0), |anova| (anova.f_statistic, anova.p_value));
        results.push((
            i,
            AnovaFeatureResult {
                name,
                f_statistic,
                p_value,
            },
        ));
    }

    // Ascending p-value with NaN last. Treating NaN as "equal to everything"
    // is not a total order and leaves the sort result unspecified.
    results.sort_by(|a, b| {
        let (pa, pb) = (a.1.p_value, b.1.p_value);
        pa.is_nan()
            .cmp(&pb.is_nan())
            .then_with(|| pa.partial_cmp(&pb).unwrap_or(std::cmp::Ordering::Equal))
    });

    let selected_indices: Vec<usize> = results
        .iter()
        .filter(|(_, r)| r.p_value <= significance_level)
        .map(|(idx, _)| *idx)
        .collect();
    let features_sorted = results.into_iter().map(|(_, r)| r).collect();
    Ok(AnovaSelectionResult {
        features: features_sorted,
        selected_indices,
    })
}
/// Mutual-information score of a single feature.
#[derive(Debug, Clone)]
pub struct MutualInfoFeature {
/// Feature name (`feature_{i}` when the caller supplied no name for it).
pub name: String,
/// Position of the feature in the input `features` slice.
pub index: usize,
/// Estimated mutual information with the class label, computed with the
/// natural logarithm and floored at 0.
pub mi: f64,
}
/// Output of `mutual_info_classif`.
#[derive(Debug, Clone)]
pub struct MutualInfoResult {
/// Per-feature scores, sorted by descending mutual information.
pub features: Vec<MutualInfoFeature>,
}
/// Estimates the mutual information between each feature and a class label.
///
/// Each feature is discretised into rank-based (equal-frequency) bins and the
/// mutual information of the resulting bin/class contingency table is
/// computed with the natural logarithm. Results are sorted by descending
/// mutual information.
///
/// * `n_bins` — number of bins; `None` selects `max(2, ceil(log2(n) + 1))`.
///   Values larger than the number of rows are clamped to it, which yields
///   the same partition without the oversized table.
/// * Equal feature values always share a bin, so the estimate does not depend
///   on the row order of tied observations (a constant feature scores 0).
/// * Missing entries in `feature_names` fall back to `feature_{i}`.
///
/// # Errors
/// - `InvalidParameter` when `features` is empty or `n_bins` is `Some(0)`.
/// - `InsufficientData` when the first feature has fewer than 4 rows.
/// - `DimensionMismatch` when `target` or any feature differs in length from
///   the first feature.
/// - `DegenerateData` when `target` contains fewer than 2 distinct classes.
/// - `MissingValues` when a feature contains NaN.
pub fn mutual_info_classif(
    features: &[Vec<f64>],
    feature_names: &[String],
    target: &[usize],
    n_bins: Option<usize>,
) -> Result<MutualInfoResult, InsightError> {
    if features.is_empty() {
        return Err(InsightError::InvalidParameter {
            name: "features".into(),
            message: "at least 1 feature required".into(),
        });
    }
    let n = features[0].len();
    if n < 4 {
        return Err(InsightError::InsufficientData {
            min_required: 4,
            actual: n,
        });
    }
    if target.len() != n {
        return Err(InsightError::DimensionMismatch {
            expected: n,
            actual: target.len(),
        });
    }

    let bins = n_bins.unwrap_or_else(|| {
        let k = ((n as f64).ln() / std::f64::consts::LN_2 + 1.0).ceil() as usize;
        k.max(2)
    });
    if bins == 0 {
        // Zero bins would underflow `bins - 1` and index an empty table.
        return Err(InsightError::InvalidParameter {
            name: "n_bins".into(),
            message: "at least 1 bin required".into(),
        });
    }
    // With more bins than rows every tie group already has its own bin, so
    // clamping leaves the partition unchanged while keeping `rank * bins` and
    // the contingency table small.
    let bins = bins.min(n);

    let mut classes: Vec<usize> = target.to_vec();
    classes.sort_unstable();
    classes.dedup();
    let n_classes = classes.len();
    if n_classes < 2 {
        return Err(InsightError::DegenerateData {
            reason: format!("mutual information requires at least 2 classes, got {n_classes}"),
        });
    }

    // `classes` is sorted and deduplicated, so the insertion point of a label
    // is its dense class index. Computed once and reused for every feature.
    let class_of_row: Vec<usize> = target
        .iter()
        .map(|t| classes.partition_point(|c| c < t))
        .collect();
    let mut class_counts = vec![0usize; n_classes];
    for &class_idx in &class_of_row {
        class_counts[class_idx] += 1;
    }

    let mut results: Vec<MutualInfoFeature> = Vec::with_capacity(features.len());
    for (fi, feature) in features.iter().enumerate() {
        if feature.len() != n {
            return Err(InsightError::DimensionMismatch {
                expected: n,
                actual: feature.len(),
            });
        }
        let name = feature_names
            .get(fi)
            .cloned()
            .unwrap_or_else(|| format!("feature_{fi}"));
        let nan_count = feature.iter().filter(|v| v.is_nan()).count();
        if nan_count > 0 {
            return Err(InsightError::MissingValues {
                column: name,
                count: nan_count,
            });
        }

        let bin_labels = mi_rank_bins(feature, bins);
        let mi = mi_from_labels(&bin_labels, &class_of_row, &class_counts, bins);
        results.push(MutualInfoFeature {
            name,
            index: fi,
            mi,
        });
    }

    results.sort_by(|a, b| b.mi.partial_cmp(&a.mi).unwrap_or(std::cmp::Ordering::Equal));
    Ok(MutualInfoResult { features: results })
}

/// Assigns every value to one of `bins` rank-based bins; equal values share
/// the bin of their first rank. Requires `bins >= 1` and NaN-free `values`.
fn mi_rank_bins(values: &[f64], bins: usize) -> Vec<usize> {
    let n = values.len();
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| {
        values[a]
            .partial_cmp(&values[b])
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let mut labels = vec![0usize; n];
    let mut current_bin = 0usize;
    let mut previous: Option<f64> = None;
    for (rank, &row) in order.iter().enumerate() {
        let value = values[row];
        // Only open a new bin when the value changes, so ties are never
        // split by their (arbitrary) row order.
        if previous != Some(value) {
            current_bin = (rank * bins / n).min(bins - 1);
            previous = Some(value);
        }
        labels[row] = current_bin;
    }
    labels
}

/// Mutual information (natural log, floored at 0) of the bin/class
/// contingency table described by the per-row labels.
fn mi_from_labels(
    bin_labels: &[usize],
    class_of_row: &[usize],
    class_counts: &[usize],
    bins: usize,
) -> f64 {
    let n_classes = class_counts.len();
    let total = bin_labels.len() as f64;

    let mut joint = vec![0usize; bins * n_classes];
    let mut bin_counts = vec![0usize; bins];
    for (&bin, &class_idx) in bin_labels.iter().zip(class_of_row) {
        joint[bin * n_classes + class_idx] += 1;
        bin_counts[bin] += 1;
    }

    let mut mi = 0.0;
    for (bin, row) in joint.chunks_exact(n_classes).enumerate() {
        for (class_idx, &joint_count) in row.iter().enumerate() {
            // Empty cells contribute nothing (lim p→0 of p·ln p is 0).
            if joint_count == 0 {
                continue;
            }
            let p_xy = joint_count as f64 / total;
            let p_x = bin_counts[bin] as f64 / total;
            let p_y = class_counts[class_idx] as f64 / total;
            mi += p_xy * (p_xy / (p_x * p_y)).ln();
        }
    }
    // Rounding can push a zero estimate slightly below zero.
    mi.max(0.0)
}
// Unit tests for the validation helper and every analysis entry point in this file.
#[cfg(test)]
mod tests {
use super::*;
// Two equally long, finite columns pass validation.
#[test]
fn validate_clean_ok() {
let data = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
let names = vec!["a".into(), "b".into()];
assert!(validate_clean_data(&data, &names).is_ok());
}
// A NaN is reported as MissingValues with the column name and the NaN count.
#[test]
fn validate_nan_rejected() {
let data = vec![vec![1.0, f64::NAN, 3.0]];
let names = vec!["a".into()];
let err = validate_clean_data(&data, &names).unwrap_err();
match err {
InsightError::MissingValues { column, count } => {
assert_eq!(column, "a");
assert_eq!(count, 1);
}
_ => panic!("expected MissingValues error"),
}
}
// An infinite value is reported as DegenerateData with a descriptive reason.
#[test]
fn validate_infinity_rejected() {
let data = vec![vec![1.0, f64::INFINITY, 3.0]];
let names = vec!["a".into()];
let err = validate_clean_data(&data, &names).unwrap_err();
match err {
InsightError::DegenerateData { reason } => {
assert!(reason.contains("non-finite") || reason.contains("infinite"));
}
other => panic!("expected DegenerateData, got {other:?}"),
}
}
// NaN in the last position is still classified as MissingValues.
#[test]
fn validate_nan_returns_missing_values() {
let data = vec![vec![1.0, 2.0, f64::NAN]];
let names = vec!["a".into()];
let err = validate_clean_data(&data, &names).unwrap_err();
assert!(matches!(err, InsightError::MissingValues { .. }));
}
// No columns at all is an error.
#[test]
fn validate_empty_rejected() {
let data: Vec<Vec<f64>> = vec![];
let names: Vec<String> = vec![];
assert!(validate_clean_data(&data, &names).is_err());
}
// A single row is below the 2-row minimum.
#[test]
fn validate_single_row_rejected() {
let data = vec![vec![1.0]];
let names = vec!["a".into()];
assert!(validate_clean_data(&data, &names).is_err());
}
// Columns of different lengths are reported as DimensionMismatch.
#[test]
fn validate_dimension_mismatch() {
let data = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0]];
let names = vec!["a".into(), "b".into()];
let err = validate_clean_data(&data, &names).unwrap_err();
assert!(matches!(err, InsightError::DimensionMismatch { .. }));
}
// y = 2x gives r = 1 and z = reversed x gives r = -1 under Pearson.
#[test]
fn pearson_correlation_matrix() {
let data = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![2.0, 4.0, 6.0, 8.0, 10.0], vec![5.0, 4.0, 3.0, 2.0, 1.0], ];
let names = vec!["x".into(), "y".into(), "z".into()];
let config = CorrelationConfig::default();
let result = correlation_analysis(&data, &names, &config).unwrap();
assert_eq!(result.matrix.rows(), 3);
assert!((result.matrix.get(0, 0) - 1.0).abs() < 1e-10);
assert!((result.matrix.get(0, 1) - 1.0).abs() < 1e-10);
assert!((result.matrix.get(0, 2) + 1.0).abs() < 1e-10);
assert!(!result.high_pairs.is_empty());
}
// y = x^2 is monotone in x, so the Spearman coefficient is 1.
#[test]
fn spearman_correlation() {
let data = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![1.0, 4.0, 9.0, 16.0, 25.0], ];
let names = vec!["x".into(), "y".into()];
let config = CorrelationConfig {
method: CorrelationMethod::Spearman,
high_threshold: 0.7,
};
let result = correlation_analysis(&data, &names, &config).unwrap();
assert!((result.matrix.get(0, 1) - 1.0).abs() < 1e-10);
}
// Identical and reversed columns give Kendall coefficients of 1 and -1.
#[test]
fn kendall_correlation() {
let data = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![1.0, 2.0, 3.0, 4.0, 5.0], vec![5.0, 4.0, 3.0, 2.0, 1.0], ];
let names = vec!["a".into(), "b".into(), "c".into()];
let config = CorrelationConfig {
method: CorrelationMethod::Kendall,
high_threshold: 0.7,
};
let result = correlation_analysis(&data, &names, &config).unwrap();
assert_eq!(result.method, CorrelationMethod::Kendall);
assert!((result.matrix.get(0, 1) - 1.0).abs() < 1e-10);
assert!((result.matrix.get(0, 2) + 1.0).abs() < 1e-10);
assert!(result.high_pairs.len() >= 2);
}
// Tied values in both columns still yield a strongly positive Kendall coefficient.
#[test]
fn kendall_correlation_with_ties() {
let data = vec![
vec![1.0, 2.0, 2.0, 3.0, 4.0, 5.0],
vec![1.0, 2.0, 3.0, 3.0, 4.0, 5.0],
];
let names = vec!["a".into(), "b".into()];
let config = CorrelationConfig {
method: CorrelationMethod::Kendall,
high_threshold: 0.5,
};
let result = correlation_analysis(&data, &names, &config).unwrap();
assert!(result.matrix.get(0, 1) > 0.7);
}
// x2 is an affine function of x1, so both columns exceed the VIF threshold.
#[test]
fn vif_analysis_collinear_flags() {
let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 1e-3).collect();
let cols = vec![x1, x2];
let names = vec!["x1".into(), "x2".into()];
let r = vif_analysis(&cols, &names, 10.0).unwrap();
assert!(r.vif_per_column[0] > 10.0);
assert!(r.high_vif_columns.contains(&0));
assert!(r.high_vif_columns.contains(&1));
assert_eq!(r.threshold, 10.0);
}
// A shuffled second column is not flagged at threshold 10.
#[test]
fn vif_analysis_independent_no_flag() {
let x1: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
let x2: Vec<f64> = vec![5.0, 9.0, 1.0, 7.0, 3.0, 8.0, 2.0, 6.0, 4.0, 10.0];
let cols = vec![x1, x2];
let names = vec!["x1".into(), "x2".into()];
let r = vif_analysis(&cols, &names, 10.0).unwrap();
assert!(r.high_vif_columns.is_empty());
}
// Two orthogonal +/-1 columns give a small condition number.
#[test]
fn condition_number_orthogonal_small() {
let x1 = vec![1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
let x2 = vec![1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0];
let cols = vec![x1, x2];
let names = vec!["x1".into(), "x2".into()];
let c = condition_number(&cols, &names).unwrap();
assert!(c < 5.0);
}
// Near-perfectly collinear columns give a very large (or infinite) condition number.
#[test]
fn condition_number_collinear_huge() {
let x1: Vec<f64> = (0..20).map(|i| i as f64).collect();
let x2: Vec<f64> = x1.iter().map(|&v| v * 2.0 + 1e-6).collect();
let cols = vec![x1, x2];
let names = vec!["x1".into(), "x2".into()];
let c = condition_number(&cols, &names).unwrap();
assert!(c > 1e5 || c.is_infinite());
}
// NaN input is rejected before any VIF computation.
#[test]
fn vif_analysis_rejects_nan() {
let x1 = vec![1.0, 2.0, f64::NAN, 4.0, 5.0, 6.0];
let x2 = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let cols = vec![x1, x2];
let names = vec!["x1".into(), "x2".into()];
assert!(vif_analysis(&cols, &names, 10.0).is_err());
}
// Only the (x, y) pair exceeds the 0.9 threshold; the noise column does not.
#[test]
fn high_correlation_filtering() {
let data = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0],
vec![2.0, 4.0, 6.0, 8.0, 10.0], vec![3.1, 2.9, 3.0, 3.2, 2.8], ];
let names = vec!["x".into(), "y".into(), "noise".into()];
let config = CorrelationConfig {
method: CorrelationMethod::Pearson,
high_threshold: 0.9,
};
let result = correlation_analysis(&data, &names, &config).unwrap();
assert_eq!(result.high_pairs.len(), 1);
assert_eq!(result.high_pairs[0].col_a, "x");
assert_eq!(result.high_pairs[0].col_b, "y");
}
// NaN input makes the correlation analysis fail.
#[test]
fn correlation_rejects_nan() {
let data = vec![vec![1.0, f64::NAN, 3.0], vec![4.0, 5.0, 6.0]];
let names = vec!["a".into(), "b".into()];
let config = CorrelationConfig::default();
assert!(correlation_analysis(&data, &names, &config).is_err());
}
// Single predictor: two coefficients (intercept, slope) and no VIF values.
#[test]
fn simple_regression() {
let x = vec![vec![1.0, 2.0, 3.0, 4.0, 5.0]];
let y = vec![2.1, 3.9, 6.1, 7.9, 10.1];
let result = regression_analysis(&x, &["x".into()], &y, "y").unwrap();
assert!(result.r_squared > 0.99);
assert_eq!(result.coefficients.len(), 2); assert!(result.vif.is_empty()); }
// Two predictors with an exact linear target: three coefficients and two VIF values.
#[test]
fn multiple_regression() {
let x1: Vec<f64> = (1..=20).map(|i| i as f64).collect();
let x2: Vec<f64> = (1..=20)
.map(|i| (i as f64) * 0.5 + 3.0 + ((i * 7 % 11) as f64) * 0.3)
.collect();
let y: Vec<f64> = x1
.iter()
.zip(x2.iter())
.map(|(&a, &b)| 2.0 * a + 3.0 * b + 1.0)
.collect();
let predictors = vec![x1, x2];
let result =
regression_analysis(&predictors, &["x1".into(), "x2".into()], &y, "y").unwrap();
assert!(result.r_squared > 0.99);
assert_eq!(result.coefficients.len(), 3); assert_eq!(result.vif.len(), 2); }
// NaN in the target is reported as MissingValues.
#[test]
fn regression_rejects_nan_target() {
let x = vec![vec![1.0, 2.0, 3.0]];
let y = vec![1.0, f64::NAN, 3.0];
let err = regression_analysis(&x, &["x".into()], &y, "y").unwrap_err();
assert!(matches!(err, InsightError::MissingValues { .. }));
}
// NaN in a predictor is reported as MissingValues.
#[test]
fn regression_rejects_nan_predictor() {
let x = vec![vec![1.0, f64::NAN, 3.0]];
let y = vec![1.0, 2.0, 3.0];
let err = regression_analysis(&x, &["x".into()], &y, "y").unwrap_err();
assert!(matches!(err, InsightError::MissingValues { .. }));
}
// Two observations are below the 3-observation minimum.
#[test]
fn regression_insufficient_data() {
let x = vec![vec![1.0, 2.0]];
let y = vec![1.0, 2.0];
assert!(regression_analysis(&x, &["x".into()], &y, "y").is_err());
}
// A diagonal 2x2 table yields V close to 1.
#[test]
fn cramers_v_perfect_association() {
let table = [50.0, 0.0, 0.0, 50.0];
let result = cramers_v(&table, 2, 2).unwrap();
assert!(result.v > 0.9, "V should be near 1.0: {}", result.v);
}
// A uniform 2x2 table yields V close to 0.
#[test]
fn cramers_v_no_association() {
let table = [25.0, 25.0, 25.0, 25.0];
let result = cramers_v(&table, 2, 2).unwrap();
assert!(result.v < 0.05, "V should be near 0: {}", result.v);
}
// A diagonal-dominant 3x3 table yields a high V.
#[test]
fn cramers_v_3x3() {
let table = [30.0, 1.0, 1.0, 1.0, 30.0, 1.0, 1.0, 1.0, 30.0];
let result = cramers_v(&table, 3, 3).unwrap();
assert!(result.v > 0.7, "V should be high: {}", result.v);
}
// V and the p-value both stay within [0, 1] for a 2x3 table.
#[test]
fn cramers_v_range() {
let table = [10.0, 20.0, 30.0, 15.0, 25.0, 5.0];
let result = cramers_v(&table, 2, 3).unwrap();
assert!(result.v >= 0.0 && result.v <= 1.0);
assert!(result.p_value >= 0.0 && result.p_value <= 1.0);
}
// A 1x2 table and a table whose length does not match its dimensions give None.
#[test]
fn cramers_v_invalid() {
assert!(cramers_v(&[10.0, 20.0], 1, 2).is_none()); assert!(cramers_v(&[10.0], 2, 2).is_none()); }
// Feature A separates the two classes and must rank first and be selected.
#[test]
fn anova_separating_feature() {
let features = vec![
vec![1.0, 1.1, 1.2, 5.0, 5.1, 5.2], vec![3.0, 3.1, 2.9, 3.0, 3.1, 2.9], ];
let names = vec!["A".into(), "B".into()];
let target = vec![0, 0, 0, 1, 1, 1];
let result = anova_feature_selection(&features, &names, &target, 0.05).unwrap();
assert_eq!(result.features.len(), 2);
assert_eq!(result.features[0].name, "A");
assert!(result.features[0].p_value < 0.01);
assert!(result.selected_indices.contains(&0));
}
// Features with identical per-class values still produce one result each.
#[test]
fn anova_no_significant() {
let features = vec![
vec![3.0, 3.1, 2.9, 3.0, 3.1, 2.9],
vec![5.0, 5.1, 4.9, 5.0, 5.1, 4.9],
];
let names = vec!["X".into(), "Y".into()];
let target = vec![0, 0, 0, 1, 1, 1];
let result = anova_feature_selection(&features, &names, &target, 0.01).unwrap();
assert_eq!(result.features.len(), 2);
}
// Three well-separated classes give a significant, selected feature.
#[test]
fn anova_multiple_classes() {
let features = vec![vec![1.0, 1.1, 1.2, 5.0, 5.1, 5.2, 10.0, 10.1, 10.2]];
let names = vec!["f0".into()];
let target = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
let result = anova_feature_selection(&features, &names, &target, 0.05).unwrap();
assert_eq!(result.features.len(), 1);
assert!(result.features[0].p_value < 0.01);
assert_eq!(result.selected_indices.len(), 1);
}
// NaN in a feature is an error.
#[test]
fn anova_rejects_nan() {
let features = vec![vec![1.0, f64::NAN, 3.0, 4.0]];
let names = vec!["f0".into()];
let target = vec![0, 0, 1, 1];
assert!(anova_feature_selection(&features, &names, &target, 0.05).is_err());
}
// A target with only one class is an error.
#[test]
fn anova_rejects_single_class() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0]];
let names = vec!["f0".into()];
let target = vec![0, 0, 0, 0]; assert!(anova_feature_selection(&features, &names, &target, 0.05).is_err());
}
// A target shorter than the features is an error.
#[test]
fn anova_rejects_dimension_mismatch() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0]];
let names = vec!["f0".into()];
let target = vec![0, 0, 1]; assert!(anova_feature_selection(&features, &names, &target, 0.05).is_err());
}
// The class-separating feature must score higher than the noise feature.
#[test]
fn mi_separating_vs_noise() {
let features = vec![
vec![1.0, 1.1, 1.2, 5.0, 5.1, 5.2],
vec![3.0, 3.1, 2.9, 3.0, 3.1, 2.9],
];
let names = vec!["A".into(), "B".into()];
let target = vec![0, 0, 0, 1, 1, 1];
let result = mutual_info_classif(&features, &names, &target, None).unwrap();
assert_eq!(result.features.len(), 2);
assert!(
result.features[0].mi > result.features[1].mi,
"A MI={} should > B MI={}",
result.features[0].mi,
result.features[1].mi
);
}
// Three value clusters aligned with three classes give a clearly positive MI.
#[test]
fn mi_perfect_dependence() {
let features = vec![vec![0.0, 0.1, 0.2, 5.0, 5.1, 5.2, 10.0, 10.1, 10.2]];
let names = vec!["f0".into()];
let target = vec![0, 0, 0, 1, 1, 1, 2, 2, 2];
let result = mutual_info_classif(&features, &names, &target, None).unwrap();
assert!(result.features[0].mi > 0.5, "MI={}", result.features[0].mi);
}
// Every reported MI value is non-negative.
#[test]
fn mi_nonnegative() {
let features = vec![
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
vec![5.0, 3.0, 7.0, 1.0, 8.0, 2.0, 6.0, 4.0],
];
let names = vec!["a".into(), "b".into()];
let target = vec![0, 0, 0, 0, 1, 1, 1, 1];
let result = mutual_info_classif(&features, &names, &target, None).unwrap();
for f in &result.features {
assert!(f.mi >= 0.0, "{} MI={}", f.name, f.mi);
}
}
// An explicit bin count of 3 is accepted and yields a positive MI.
#[test]
fn mi_custom_bins() {
let features = vec![vec![1.0, 1.1, 5.0, 5.1, 10.0, 10.1]];
let names = vec!["f0".into()];
let target = vec![0, 0, 1, 1, 2, 2];
let result = mutual_info_classif(&features, &names, &target, Some(3)).unwrap();
assert!(result.features[0].mi > 0.0);
}
// NaN in a feature is an error.
#[test]
fn mi_rejects_nan() {
let features = vec![vec![1.0, f64::NAN, 3.0, 4.0]];
let names = vec!["f0".into()];
let target = vec![0, 0, 1, 1];
assert!(mutual_info_classif(&features, &names, &target, None).is_err());
}
// A target with only one class is an error.
#[test]
fn mi_rejects_single_class() {
let features = vec![vec![1.0, 2.0, 3.0, 4.0]];
let names = vec!["f0".into()];
let target = vec![0, 0, 0, 0];
assert!(mutual_info_classif(&features, &names, &target, None).is_err());
}
}