use cyanea_core::{CyaneaError, Result, Summarizable};
#[derive(Debug, Clone)]
pub struct DescriptiveStats {
pub count: usize,
pub mean: f64,
pub median: f64,
pub variance: f64,
pub sample_variance: f64,
pub std_dev: f64,
pub sample_std_dev: f64,
pub min: f64,
pub max: f64,
pub range: f64,
pub q1: f64,
pub q3: f64,
pub iqr: f64,
pub skewness: f64,
pub kurtosis: f64,
}
impl Summarizable for DescriptiveStats {
fn summary(&self) -> String {
format!(
"n={}, mean={:.4}, std={:.4}, min={:.4}, max={:.4}",
self.count, self.mean, self.std_dev, self.min, self.max,
)
}
}
pub fn describe(data: &[f64]) -> Result<DescriptiveStats> {
if data.is_empty() {
return Err(CyaneaError::InvalidInput(
"describe: data must not be empty".into(),
));
}
let n = data.len();
let n_f = n as f64;
let mut sum = 0.0;
let mut min_val = f64::INFINITY;
let mut max_val = f64::NEG_INFINITY;
for &x in data {
sum += x;
if x < min_val {
min_val = x;
}
if x > max_val {
max_val = x;
}
}
let mean_val = sum / n_f;
let mut m2 = 0.0;
let mut m3 = 0.0;
let mut m4 = 0.0;
for &x in data {
let d = x - mean_val;
m2 += d * d;
m3 += d * d * d;
m4 += d * d * d * d;
}
let pop_var = m2 / n_f;
let sample_var = if n > 1 { m2 / (n_f - 1.0) } else { f64::NAN };
let pop_std = pop_var.sqrt();
let sample_std = sample_var.sqrt();
let skewness = if pop_std > 0.0 {
(m3 / n_f) / (pop_std * pop_std * pop_std)
} else {
0.0
};
let kurtosis = if pop_std > 0.0 {
(m4 / n_f) / (pop_var * pop_var) - 3.0
} else {
0.0
};
let mut sorted = data.to_vec();
sorted.sort_by(|a, b| a.total_cmp(b));
let median_val = compute_quantile_sorted(&sorted, 0.5);
let q1_val = compute_quantile_sorted(&sorted, 0.25);
let q3_val = compute_quantile_sorted(&sorted, 0.75);
Ok(DescriptiveStats {
count: n,
mean: mean_val,
median: median_val,
variance: pop_var,
sample_variance: sample_var,
std_dev: pop_std,
sample_std_dev: sample_std,
min: min_val,
max: max_val,
range: max_val - min_val,
q1: q1_val,
q3: q3_val,
iqr: q3_val - q1_val,
skewness,
kurtosis,
})
}
pub fn mean(data: &[f64]) -> Result<f64> {
if data.is_empty() {
return Err(CyaneaError::InvalidInput(
"mean: data must not be empty".into(),
));
}
Ok(data.iter().sum::<f64>() / data.len() as f64)
}
pub fn median(data: &[f64]) -> Result<f64> {
if data.is_empty() {
return Err(CyaneaError::InvalidInput(
"median: data must not be empty".into(),
));
}
let mut sorted = data.to_vec();
sorted.sort_by(|a, b| a.total_cmp(b));
Ok(compute_quantile_sorted(&sorted, 0.5))
}
pub fn variance(data: &[f64], ddof: usize) -> Result<f64> {
let n = data.len();
if n <= ddof {
return Err(CyaneaError::InvalidInput(format!(
"variance: need more than {} observations (got {})",
ddof, n,
)));
}
let m = mean(data)?;
let ss: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
Ok(ss / (n - ddof) as f64)
}
pub fn std_dev(data: &[f64], ddof: usize) -> Result<f64> {
Ok(variance(data, ddof)?.sqrt())
}
pub fn quantile(data: &[f64], q: f64) -> Result<f64> {
if data.is_empty() {
return Err(CyaneaError::InvalidInput(
"quantile: data must not be empty".into(),
));
}
if !(0.0..=1.0).contains(&q) {
return Err(CyaneaError::InvalidInput(
"quantile: q must be in [0, 1]".into(),
));
}
let mut sorted = data.to_vec();
sorted.sort_by(|a, b| a.total_cmp(b));
Ok(compute_quantile_sorted(&sorted, q))
}
pub fn iqr(data: &[f64]) -> Result<f64> {
let q1 = quantile(data, 0.25)?;
let q3 = quantile(data, 0.75)?;
Ok(q3 - q1)
}
pub fn mad(data: &[f64]) -> Result<f64> {
let med = median(data)?;
let deviations: Vec<f64> = data.iter().map(|&x| (x - med).abs()).collect();
median(&deviations)
}
fn compute_quantile_sorted(sorted: &[f64], q: f64) -> f64 {
let n = sorted.len();
if n == 1 {
return sorted[0];
}
let pos = q * (n - 1) as f64;
let lo = pos.floor() as usize;
let hi = lo + 1;
let frac = pos - lo as f64;
if hi >= n {
sorted[n - 1]
} else {
sorted[lo] * (1.0 - frac) + sorted[hi] * frac
}
}
#[cfg(test)]
mod tests {
use super::*;
const TOL: f64 = 1e-10;
#[test]
fn describe_known_data() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0];
let stats = describe(&data).unwrap();
assert_eq!(stats.count, 5);
assert!((stats.mean - 3.0).abs() < TOL);
assert!((stats.median - 3.0).abs() < TOL);
assert!((stats.min - 1.0).abs() < TOL);
assert!((stats.max - 5.0).abs() < TOL);
assert!((stats.range - 4.0).abs() < TOL);
assert!((stats.variance - 2.0).abs() < TOL);
assert!((stats.sample_variance - 2.5).abs() < TOL);
}
#[test]
fn describe_single() {
let data = [42.0];
let stats = describe(&data).unwrap();
assert_eq!(stats.count, 1);
assert!((stats.mean - 42.0).abs() < TOL);
assert!((stats.variance - 0.0).abs() < TOL);
assert!(stats.sample_variance.is_nan());
}
#[test]
fn describe_empty() {
assert!(describe(&[]).is_err());
}
#[test]
fn mean_basic() {
assert!((mean(&[2.0, 4.0, 6.0]).unwrap() - 4.0).abs() < TOL);
}
#[test]
fn mean_empty() {
assert!(mean(&[]).is_err());
}
#[test]
fn median_odd() {
assert!((median(&[3.0, 1.0, 2.0]).unwrap() - 2.0).abs() < TOL);
}
#[test]
fn median_even() {
assert!((median(&[4.0, 1.0, 3.0, 2.0]).unwrap() - 2.5).abs() < TOL);
}
#[test]
fn variance_population() {
let data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
assert!((variance(&data, 0).unwrap() - 4.0).abs() < TOL);
}
#[test]
fn variance_sample() {
let data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
let expected = 32.0 / 7.0; assert!((variance(&data, 1).unwrap() - expected).abs() < TOL);
}
#[test]
fn variance_too_few() {
assert!(variance(&[1.0], 1).is_err());
assert!(variance(&[], 0).is_err());
}
#[test]
fn quantile_basic() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0];
assert!((quantile(&data, 0.0).unwrap() - 1.0).abs() < TOL);
assert!((quantile(&data, 1.0).unwrap() - 5.0).abs() < TOL);
assert!((quantile(&data, 0.5).unwrap() - 3.0).abs() < TOL);
}
#[test]
fn quantile_invalid_q() {
assert!(quantile(&[1.0], -0.1).is_err());
assert!(quantile(&[1.0], 1.1).is_err());
}
#[test]
fn iqr_basic() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = iqr(&data).unwrap();
assert!((result - 3.5).abs() < TOL);
}
#[test]
fn mad_basic() {
let data = [1.0, 1.0, 2.0, 2.0, 4.0, 6.0, 9.0];
assert!((mad(&data).unwrap() - 1.0).abs() < TOL);
}
#[test]
fn describe_skewness_symmetric() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0];
let stats = describe(&data).unwrap();
assert!((stats.skewness).abs() < TOL);
}
#[test]
fn summarizable_impl() {
let stats = describe(&[1.0, 2.0, 3.0]).unwrap();
let s = stats.summary();
assert!(s.contains("n=3"));
assert!(s.contains("mean="));
}
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
fn finite_f64() -> impl Strategy<Value = f64> {
-1e12_f64..1e12_f64
}
proptest! {
#[test]
fn describe_count_equals_length(
data in proptest::collection::vec(finite_f64(), 1..=500)
) {
let stats = describe(&data).unwrap();
prop_assert_eq!(stats.count, data.len());
}
#[test]
fn variance_nonnegative(
data in proptest::collection::vec(finite_f64(), 1..=500)
) {
let stats = describe(&data).unwrap();
prop_assert!(stats.variance >= 0.0, "variance={} should be >= 0", stats.variance);
}
#[test]
fn min_le_mean_le_max(
data in proptest::collection::vec(finite_f64(), 1..=500)
) {
let stats = describe(&data).unwrap();
prop_assert!(stats.min <= stats.mean + 1e-10,
"min={} > mean={}", stats.min, stats.mean);
prop_assert!(stats.mean <= stats.max + 1e-10,
"mean={} > max={}", stats.mean, stats.max);
}
#[test]
fn range_equals_max_minus_min(
data in proptest::collection::vec(finite_f64(), 1..=500)
) {
let stats = describe(&data).unwrap();
prop_assert!((stats.range - (stats.max - stats.min)).abs() < 1e-10);
}
#[test]
fn median_between_min_and_max(
data in proptest::collection::vec(finite_f64(), 1..=500)
) {
let stats = describe(&data).unwrap();
prop_assert!(stats.median >= stats.min - 1e-10,
"median={} < min={}", stats.median, stats.min);
prop_assert!(stats.median <= stats.max + 1e-10,
"median={} > max={}", stats.median, stats.max);
}
}
}