use crate::error::InsightError;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinMethod {
Sturges,
Scott,
FreedmanDiaconis,
}
#[derive(Debug, Clone)]
pub struct DistributionConfig {
pub bin_method: BinMethod,
pub significance_level: f64,
pub compute_ecdf: bool,
pub compute_histogram: bool,
pub compute_qq_plot: bool,
pub fit_distributions: bool,
}
impl Default for DistributionConfig {
fn default() -> Self {
Self {
bin_method: BinMethod::FreedmanDiaconis,
significance_level: 0.05,
compute_ecdf: true,
compute_histogram: true,
compute_qq_plot: true,
fit_distributions: false,
}
}
}
#[derive(Debug, Clone)]
pub struct EcdfResult {
pub values: Vec<f64>,
pub probabilities: Vec<f64>,
}
#[derive(Debug, Clone)]
pub struct HistogramResult {
pub n_bins: usize,
pub bin_width: f64,
pub edges: Vec<f64>,
pub counts: Vec<usize>,
pub method: BinMethod,
}
#[derive(Debug, Clone)]
pub struct QQPlotResult {
pub theoretical: Vec<f64>,
pub sample: Vec<f64>,
}
#[derive(Debug, Clone, Copy)]
pub struct NormalityTestResult {
pub statistic: f64,
pub p_value: f64,
pub rejected: bool,
}
#[derive(Debug, Clone)]
pub struct NormalityAssessment {
pub ks_test: Option<NormalityTestResult>,
pub jarque_bera: Option<NormalityTestResult>,
pub shapiro_wilk: Option<NormalityTestResult>,
pub anderson_darling: Option<NormalityTestResult>,
pub is_normal: bool,
pub significance_level: f64,
}
#[derive(Debug, Clone)]
pub struct FitResult {
pub distribution: String,
pub parameters: Vec<(String, f64)>,
pub log_likelihood: f64,
pub aic: f64,
pub bic: f64,
pub n_params: usize,
}
#[derive(Debug, Clone)]
pub struct DistributionAnalysis {
pub n: usize,
pub ecdf: Option<EcdfResult>,
pub histogram: Option<HistogramResult>,
pub qq_plot: Option<QQPlotResult>,
pub normality: NormalityAssessment,
pub fits: Vec<FitResult>,
}
pub fn distribution_analysis(
data: &[f64],
config: &DistributionConfig,
) -> Result<DistributionAnalysis, InsightError> {
let n = data.len();
if n < 2 {
return Err(InsightError::InsufficientData {
min_required: 2,
actual: n,
});
}
let nan_count = data.iter().filter(|v| v.is_nan()).count();
if nan_count > 0 {
return Err(InsightError::MissingValues {
column: "data".to_string(),
count: nan_count,
});
}
let inf_count = data.iter().filter(|v| v.is_infinite()).count();
if inf_count > 0 {
return Err(InsightError::DegenerateData {
reason: format!("input contains {inf_count} non-finite (infinite) value(s)"),
});
}
let ecdf = if config.compute_ecdf {
u_analytics::distribution::ecdf(data).map(|(values, probabilities)| EcdfResult {
values,
probabilities,
})
} else {
None
};
let histogram = if config.compute_histogram {
let bin_method = match config.bin_method {
BinMethod::Sturges => u_analytics::distribution::BinMethod::Sturges,
BinMethod::Scott => u_analytics::distribution::BinMethod::Scott,
BinMethod::FreedmanDiaconis => u_analytics::distribution::BinMethod::FreedmanDiaconis,
};
u_analytics::distribution::histogram_bins(data, bin_method).map(|bins| HistogramResult {
n_bins: bins.n_bins,
bin_width: bins.bin_width,
edges: bins.edges,
counts: bins.counts,
method: config.bin_method,
})
} else {
None
};
let qq_plot = if config.compute_qq_plot {
u_analytics::distribution::qq_plot_normal(data).map(|(theoretical, sample)| QQPlotResult {
theoretical,
sample,
})
} else {
None
};
let alpha = config.significance_level;
let ks_test = u_analytics::distribution::ks_test_normal(data).map(|(statistic, p_value)| {
NormalityTestResult {
statistic,
p_value,
rejected: p_value < alpha,
}
});
let jarque_bera = u_analytics::testing::jarque_bera_test(data).map(|r| NormalityTestResult {
statistic: r.statistic,
p_value: r.p_value,
rejected: r.p_value < alpha,
});
let shapiro_wilk = u_analytics::testing::shapiro_wilk_test(data).map(|r| NormalityTestResult {
statistic: r.w,
p_value: r.p_value,
rejected: r.p_value < alpha,
});
let anderson_darling =
u_analytics::testing::anderson_darling_test(data).map(|r| NormalityTestResult {
statistic: r.statistic_star,
p_value: r.p_value,
rejected: r.p_value < alpha,
});
let tests: [Option<NormalityTestResult>; 4] =
[ks_test, jarque_bera, shapiro_wilk, anderson_darling];
let any_rejected = tests.iter().any(|t| t.is_some_and(|r| r.rejected));
let any_available = tests.iter().any(|t| t.is_some());
let is_normal = any_available && !any_rejected;
let normality = NormalityAssessment {
ks_test,
jarque_bera,
shapiro_wilk,
anderson_darling,
is_normal,
significance_level: alpha,
};
let fits = if config.fit_distributions {
u_analytics::distribution::fit_best(data)
.into_iter()
.map(|r| FitResult {
distribution: r.distribution,
parameters: r.parameters,
log_likelihood: r.log_likelihood,
aic: r.aic,
bic: r.bic,
n_params: r.n_params,
})
.collect()
} else {
Vec::new()
};
Ok(DistributionAnalysis {
n,
ecdf,
histogram,
qq_plot,
normality,
fits,
})
}
pub fn ecdf(data: &[f64]) -> Option<EcdfResult> {
u_analytics::distribution::ecdf(data).map(|(values, probabilities)| EcdfResult {
values,
probabilities,
})
}
pub fn histogram(data: &[f64], method: BinMethod) -> Option<HistogramResult> {
let bin_method = match method {
BinMethod::Sturges => u_analytics::distribution::BinMethod::Sturges,
BinMethod::Scott => u_analytics::distribution::BinMethod::Scott,
BinMethod::FreedmanDiaconis => u_analytics::distribution::BinMethod::FreedmanDiaconis,
};
u_analytics::distribution::histogram_bins(data, bin_method).map(|bins| HistogramResult {
n_bins: bins.n_bins,
bin_width: bins.bin_width,
edges: bins.edges,
counts: bins.counts,
method,
})
}
pub fn qq_plot(data: &[f64]) -> Option<QQPlotResult> {
u_analytics::distribution::qq_plot_normal(data).map(|(theoretical, sample)| QQPlotResult {
theoretical,
sample,
})
}
pub fn ks_test(data: &[f64]) -> Option<(f64, f64)> {
u_analytics::distribution::ks_test_normal(data)
}
pub fn jarque_bera(data: &[f64]) -> Option<NormalityTestResult> {
u_analytics::testing::jarque_bera_test(data).map(|r| NormalityTestResult {
statistic: r.statistic,
p_value: r.p_value,
rejected: r.p_value < 0.05,
})
}
pub fn fit_distributions(data: &[f64]) -> Vec<FitResult> {
u_analytics::distribution::fit_best(data)
.into_iter()
.map(|r| FitResult {
distribution: r.distribution,
parameters: r.parameters,
log_likelihood: r.log_likelihood,
aic: r.aic,
bic: r.bic,
n_params: r.n_params,
})
.collect()
}
pub fn shapiro_wilk(data: &[f64]) -> Option<NormalityTestResult> {
u_analytics::testing::shapiro_wilk_test(data).map(|r| NormalityTestResult {
statistic: r.w,
p_value: r.p_value,
rejected: r.p_value < 0.05,
})
}
pub fn anderson_darling(data: &[f64]) -> Option<NormalityTestResult> {
u_analytics::testing::anderson_darling_test(data).map(|r| NormalityTestResult {
statistic: r.statistic_star,
p_value: r.p_value,
rejected: r.p_value < 0.05,
})
}
#[derive(Debug, Clone, Copy)]
pub struct GrubbsResult {
pub statistic: f64,
pub critical_value: f64,
pub outlier_index: usize,
pub outlier_value: f64,
pub is_outlier: bool,
}
pub fn grubbs_test(data: &[f64], alpha: f64) -> Option<GrubbsResult> {
let n = data.len();
if n < 3 {
return None;
}
if data.iter().any(|v| !v.is_finite()) {
return None;
}
let mean: f64 = data.iter().sum::<f64>() / n as f64;
let variance: f64 = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64;
let std_dev = variance.sqrt();
if std_dev < 1e-15 {
return None; }
let (outlier_index, outlier_value) = data.iter().enumerate().max_by(|a, b| {
(a.1 - mean)
.abs()
.partial_cmp(&(b.1 - mean).abs())
.unwrap_or(std::cmp::Ordering::Equal)
})?;
let statistic = (outlier_value - mean).abs() / std_dev;
let alpha_adj = alpha / (2.0 * n as f64);
let df = (n - 2) as f64;
let t_val = u_numflow::special::t_distribution_quantile(1.0 - alpha_adj, df);
let t2 = t_val * t_val;
let critical_value = ((n - 1) as f64 / (n as f64).sqrt()) * (t2 / (df + t2)).sqrt();
Some(GrubbsResult {
statistic,
critical_value,
outlier_index,
outlier_value: *outlier_value,
is_outlier: statistic > critical_value,
})
}
#[cfg(test)]
mod tests {
use super::*;
fn nearly_normal_data() -> Vec<f64> {
vec![
-2.5, -2.0, -1.8, -1.5, -1.2, -1.0, -0.8, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.8, 1.0,
1.2, 1.5, 1.8, 2.0, 2.5,
]
}
fn uniform_data() -> Vec<f64> {
(0..50).map(|i| i as f64 * 2.0).collect()
}
#[test]
fn full_analysis_normal_data() {
let data = nearly_normal_data();
let config = DistributionConfig::default();
let result = distribution_analysis(&data, &config).unwrap();
assert_eq!(result.n, 20);
assert!(result.ecdf.is_some());
assert!(result.histogram.is_some());
assert!(result.qq_plot.is_some());
assert!(result.normality.ks_test.is_some());
assert!(result.normality.jarque_bera.is_some());
assert!(result.normality.shapiro_wilk.is_some());
assert!(result.normality.anderson_darling.is_some());
let ks = result.normality.ks_test.unwrap();
assert!(ks.p_value > 0.05, "KS p = {}", ks.p_value);
assert!(!ks.rejected);
assert!(result.normality.is_normal);
let sw = result.normality.shapiro_wilk.unwrap();
assert!(sw.p_value > 0.05, "SW p = {}", sw.p_value);
let ad = result.normality.anderson_darling.unwrap();
assert!(ad.p_value > 0.05, "AD p = {}", ad.p_value);
}
#[test]
fn full_analysis_uniform_data() {
let data = uniform_data();
let config = DistributionConfig::default();
let result = distribution_analysis(&data, &config).unwrap();
assert_eq!(result.n, 50);
assert!(result.histogram.is_some());
let hist = result.histogram.unwrap();
assert_eq!(hist.counts.iter().sum::<usize>(), 50);
}
#[test]
fn full_analysis_rejects_nan() {
let data = vec![1.0, f64::NAN, 3.0];
let config = DistributionConfig::default();
let err = distribution_analysis(&data, &config).unwrap_err();
assert!(matches!(err, InsightError::MissingValues { .. }));
}
#[test]
fn full_analysis_rejects_infinity() {
let data = vec![1.0, f64::INFINITY, 3.0];
let config = DistributionConfig::default();
let err = distribution_analysis(&data, &config).unwrap_err();
match err {
InsightError::DegenerateData { reason } => {
assert!(reason.contains("non-finite") || reason.contains("infinite"));
}
other => panic!("expected DegenerateData, got {other:?}"),
}
}
#[test]
fn full_analysis_rejects_insufficient() {
let data = vec![1.0];
let config = DistributionConfig::default();
let err = distribution_analysis(&data, &config).unwrap_err();
assert!(matches!(err, InsightError::InsufficientData { .. }));
}
#[test]
fn config_skip_components() {
let data = nearly_normal_data();
let config = DistributionConfig {
compute_ecdf: false,
compute_histogram: false,
compute_qq_plot: false,
..Default::default()
};
let result = distribution_analysis(&data, &config).unwrap();
assert!(result.ecdf.is_none());
assert!(result.histogram.is_none());
assert!(result.qq_plot.is_none());
assert!(result.normality.ks_test.is_some());
}
#[test]
fn ecdf_basic() {
let data = [3.0, 1.0, 2.0, 1.0, 4.0];
let result = ecdf(&data).unwrap();
assert_eq!(result.values, vec![1.0, 2.0, 3.0, 4.0]);
assert!((result.probabilities[0] - 0.4).abs() < 1e-10);
assert!((result.probabilities[3] - 1.0).abs() < 1e-10);
}
#[test]
fn ecdf_empty() {
assert!(ecdf(&[]).is_none());
}
#[test]
fn ecdf_nan() {
assert!(ecdf(&[1.0, f64::NAN]).is_none());
}
#[test]
fn histogram_sturges() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let result = histogram(&data, BinMethod::Sturges).unwrap();
assert!(result.n_bins >= 5);
assert_eq!(result.edges.len(), result.n_bins + 1);
assert_eq!(result.counts.iter().sum::<usize>(), 100);
assert_eq!(result.method, BinMethod::Sturges);
}
#[test]
fn histogram_scott() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let result = histogram(&data, BinMethod::Scott).unwrap();
assert!(result.n_bins >= 3);
assert_eq!(result.counts.iter().sum::<usize>(), 100);
}
#[test]
fn histogram_fd() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let result = histogram(&data, BinMethod::FreedmanDiaconis).unwrap();
assert!(result.n_bins >= 3);
assert_eq!(result.counts.iter().sum::<usize>(), 100);
}
#[test]
fn histogram_insufficient() {
assert!(histogram(&[1.0], BinMethod::Sturges).is_none());
}
#[test]
fn histogram_constant() {
assert!(histogram(&[5.0, 5.0, 5.0], BinMethod::Sturges).is_none());
}
#[test]
fn qq_plot_basic() {
let data = [-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5];
let result = qq_plot(&data).unwrap();
assert_eq!(result.theoretical.len(), 7);
assert_eq!(result.sample.len(), 7);
assert!((result.sample[0] + 1.5).abs() < 1e-10);
assert!((result.sample[6] - 1.5).abs() < 1e-10);
assert!(result.theoretical[3].abs() < 0.1);
}
#[test]
fn qq_plot_insufficient() {
assert!(qq_plot(&[1.0, 2.0]).is_none());
}
#[test]
fn ks_test_normal_data() {
let data = nearly_normal_data();
let (d, p) = ks_test(&data).unwrap();
assert!(d > 0.0 && d < 1.0);
assert!(p > 0.05, "p = {p}");
}
#[test]
fn ks_test_insufficient() {
assert!(ks_test(&[1.0, 2.0, 3.0, 4.0]).is_none());
}
#[test]
fn jarque_bera_normal_data() {
let data = nearly_normal_data();
let result = jarque_bera(&data).unwrap();
assert!(result.p_value > 0.05, "p = {}", result.p_value);
assert!(!result.rejected);
}
#[test]
fn jarque_bera_insufficient() {
assert!(jarque_bera(&[1.0, 2.0, 3.0]).is_none());
}
#[test]
fn shapiro_wilk_normal_data() {
let data = nearly_normal_data();
let result = shapiro_wilk(&data).unwrap();
assert!(result.statistic > 0.9, "W = {}", result.statistic);
assert!(result.p_value > 0.05, "p = {}", result.p_value);
assert!(!result.rejected);
}
#[test]
fn shapiro_wilk_insufficient() {
assert!(shapiro_wilk(&[1.0, 2.0]).is_none());
}
#[test]
fn shapiro_wilk_constant() {
assert!(shapiro_wilk(&[5.0, 5.0, 5.0, 5.0]).is_none());
}
#[test]
fn anderson_darling_normal_data() {
let data = nearly_normal_data();
let result = anderson_darling(&data).unwrap();
assert!(result.p_value > 0.05, "p = {}", result.p_value);
assert!(!result.rejected);
}
#[test]
fn anderson_darling_insufficient() {
assert!(anderson_darling(&[1.0, 2.0, 3.0]).is_none());
}
#[test]
fn fit_distributions_positive_data() {
let data: Vec<f64> = (1..=50).map(|i| i as f64).collect();
let fits = fit_distributions(&data);
assert!(!fits.is_empty());
for w in fits.windows(2) {
assert!(
w[0].aic <= w[1].aic,
"AIC not sorted: {} > {}",
w[0].aic,
w[1].aic
);
}
for f in &fits {
assert!(!f.distribution.is_empty());
assert!(!f.parameters.is_empty());
assert!(f.n_params > 0);
}
}
#[test]
fn fit_distributions_in_analysis() {
let data: Vec<f64> = (1..=30).map(|i| i as f64).collect();
let config = DistributionConfig {
fit_distributions: true,
..Default::default()
};
let result = distribution_analysis(&data, &config).unwrap();
assert!(!result.fits.is_empty());
assert!(!result.fits[0].distribution.is_empty());
}
#[test]
fn fit_distributions_default_off() {
let data = nearly_normal_data();
let config = DistributionConfig::default();
let result = distribution_analysis(&data, &config).unwrap();
assert!(result.fits.is_empty());
}
#[test]
fn fit_distributions_empty() {
let fits = fit_distributions(&[]);
assert!(fits.is_empty());
}
#[test]
fn normality_conservative_verdict() {
let data = vec![0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0];
let config = DistributionConfig::default();
let result = distribution_analysis(&data, &config).unwrap();
assert!(result.normality.ks_test.is_some());
assert!(result.normality.jarque_bera.is_none()); }
#[test]
fn normality_no_tests_available() {
let data = vec![1.0, 2.0];
let config = DistributionConfig::default();
let result = distribution_analysis(&data, &config).unwrap();
assert!(result.normality.ks_test.is_none());
assert!(result.normality.jarque_bera.is_none());
assert!(!result.normality.is_normal); }
#[test]
fn bin_method_selection() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let sturges = histogram(&data, BinMethod::Sturges).unwrap();
let fd = histogram(&data, BinMethod::FreedmanDiaconis).unwrap();
assert!(sturges.n_bins >= 2);
assert!(fd.n_bins >= 2);
assert_eq!(sturges.counts.iter().sum::<usize>(), 100);
assert_eq!(fd.counts.iter().sum::<usize>(), 100);
}
#[test]
fn grubbs_detects_clear_outlier() {
let data = [2.0, 2.1, 2.2, 2.0, 2.1, 2.3, 50.0];
let result = grubbs_test(&data, 0.05).unwrap();
assert_eq!(result.outlier_index, 6);
assert!(result.is_outlier);
assert!((result.outlier_value - 50.0).abs() < 1e-10);
}
#[test]
fn grubbs_no_outlier_in_uniform() {
let data = [1.0, 1.1, 1.2, 0.9, 1.05, 0.95, 1.15];
let result = grubbs_test(&data, 0.05).unwrap();
assert!(!result.is_outlier);
}
#[test]
fn grubbs_too_few_points() {
assert!(grubbs_test(&[1.0, 2.0], 0.05).is_none());
assert!(grubbs_test(&[], 0.05).is_none());
}
#[test]
fn grubbs_nan_returns_none() {
let data = [1.0, f64::NAN, 3.0];
assert!(grubbs_test(&data, 0.05).is_none());
}
#[test]
fn grubbs_constant_data_returns_none() {
let data = [5.0, 5.0, 5.0, 5.0, 5.0];
assert!(grubbs_test(&data, 0.05).is_none());
}
#[test]
fn grubbs_statistic_positive() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0, 100.0];
let result = grubbs_test(&data, 0.05).unwrap();
assert!(result.statistic > 0.0);
assert!(result.critical_value > 0.0);
}
#[test]
fn grubbs_stricter_alpha() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0, 10.0];
let lenient = grubbs_test(&data, 0.10).unwrap();
let strict = grubbs_test(&data, 0.01).unwrap();
assert!((lenient.statistic - strict.statistic).abs() < 1e-10);
assert!(strict.critical_value >= lenient.critical_value);
}
}