Skip to main content

cyanea_stats/
descriptive.rs

1//! Descriptive statistics for numeric data.
2//!
3//! Provides individual functions ([`mean`], [`median`], [`variance`], etc.) and
4//! the aggregate [`describe`] function that computes all common statistics in
5//! one pass.
6
7use cyanea_core::{CyaneaError, Result, Summarizable};
8
9/// Aggregate descriptive statistics for a numeric sample.
10#[derive(Debug, Clone)]
11pub struct DescriptiveStats {
12    /// Number of observations.
13    pub count: usize,
14    /// Arithmetic mean.
15    pub mean: f64,
16    /// Median (50th percentile).
17    pub median: f64,
18    /// Population variance (ddof=0).
19    pub variance: f64,
20    /// Sample variance (ddof=1).
21    pub sample_variance: f64,
22    /// Population standard deviation.
23    pub std_dev: f64,
24    /// Sample standard deviation.
25    pub sample_std_dev: f64,
26    /// Minimum value.
27    pub min: f64,
28    /// Maximum value.
29    pub max: f64,
30    /// Range (max - min).
31    pub range: f64,
32    /// First quartile (25th percentile).
33    pub q1: f64,
34    /// Third quartile (75th percentile).
35    pub q3: f64,
36    /// Interquartile range (q3 - q1).
37    pub iqr: f64,
38    /// Skewness (Fisher's definition, population).
39    pub skewness: f64,
40    /// Excess kurtosis (population).
41    pub kurtosis: f64,
42}
43
44impl Summarizable for DescriptiveStats {
45    fn summary(&self) -> String {
46        format!(
47            "n={}, mean={:.4}, std={:.4}, min={:.4}, max={:.4}",
48            self.count, self.mean, self.std_dev, self.min, self.max,
49        )
50    }
51}
52
53/// Compute all descriptive statistics for `data`.
54///
55/// Requires at least 1 element. Sample variance/std_dev require at least 2.
56pub fn describe(data: &[f64]) -> Result<DescriptiveStats> {
57    if data.is_empty() {
58        return Err(CyaneaError::InvalidInput(
59            "describe: data must not be empty".into(),
60        ));
61    }
62
63    let n = data.len();
64    let n_f = n as f64;
65
66    // First pass: mean, min, max
67    let mut sum = 0.0;
68    let mut min_val = f64::INFINITY;
69    let mut max_val = f64::NEG_INFINITY;
70    for &x in data {
71        sum += x;
72        if x < min_val {
73            min_val = x;
74        }
75        if x > max_val {
76            max_val = x;
77        }
78    }
79    let mean_val = sum / n_f;
80
81    // Second pass: variance, skewness, kurtosis
82    let mut m2 = 0.0;
83    let mut m3 = 0.0;
84    let mut m4 = 0.0;
85    for &x in data {
86        let d = x - mean_val;
87        m2 += d * d;
88        m3 += d * d * d;
89        m4 += d * d * d * d;
90    }
91
92    let pop_var = m2 / n_f;
93    let sample_var = if n > 1 { m2 / (n_f - 1.0) } else { f64::NAN };
94    let pop_std = pop_var.sqrt();
95    let sample_std = sample_var.sqrt();
96
97    let skewness = if pop_std > 0.0 {
98        (m3 / n_f) / (pop_std * pop_std * pop_std)
99    } else {
100        0.0
101    };
102
103    let kurtosis = if pop_std > 0.0 {
104        (m4 / n_f) / (pop_var * pop_var) - 3.0
105    } else {
106        0.0
107    };
108
109    // Sort a clone for quantiles
110    let mut sorted = data.to_vec();
111    sorted.sort_by(|a, b| a.total_cmp(b));
112
113    let median_val = compute_quantile_sorted(&sorted, 0.5);
114    let q1_val = compute_quantile_sorted(&sorted, 0.25);
115    let q3_val = compute_quantile_sorted(&sorted, 0.75);
116
117    Ok(DescriptiveStats {
118        count: n,
119        mean: mean_val,
120        median: median_val,
121        variance: pop_var,
122        sample_variance: sample_var,
123        std_dev: pop_std,
124        sample_std_dev: sample_std,
125        min: min_val,
126        max: max_val,
127        range: max_val - min_val,
128        q1: q1_val,
129        q3: q3_val,
130        iqr: q3_val - q1_val,
131        skewness,
132        kurtosis,
133    })
134}
135
136// ── Individual functions ───────────────────────────────────────────────────
137
138/// Arithmetic mean.
139pub fn mean(data: &[f64]) -> Result<f64> {
140    if data.is_empty() {
141        return Err(CyaneaError::InvalidInput(
142            "mean: data must not be empty".into(),
143        ));
144    }
145    Ok(data.iter().sum::<f64>() / data.len() as f64)
146}
147
148/// Median (50th percentile).
149pub fn median(data: &[f64]) -> Result<f64> {
150    if data.is_empty() {
151        return Err(CyaneaError::InvalidInput(
152            "median: data must not be empty".into(),
153        ));
154    }
155    let mut sorted = data.to_vec();
156    sorted.sort_by(|a, b| a.total_cmp(b));
157    Ok(compute_quantile_sorted(&sorted, 0.5))
158}
159
160/// Variance with given degrees-of-freedom correction.
161///
162/// - `ddof = 0` → population variance
163/// - `ddof = 1` → sample variance (Bessel's correction)
164pub fn variance(data: &[f64], ddof: usize) -> Result<f64> {
165    let n = data.len();
166    if n <= ddof {
167        return Err(CyaneaError::InvalidInput(format!(
168            "variance: need more than {} observations (got {})",
169            ddof, n,
170        )));
171    }
172    let m = mean(data)?;
173    let ss: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
174    Ok(ss / (n - ddof) as f64)
175}
176
177/// Standard deviation with given degrees-of-freedom correction.
178pub fn std_dev(data: &[f64], ddof: usize) -> Result<f64> {
179    Ok(variance(data, ddof)?.sqrt())
180}
181
182/// Quantile using linear interpolation (exclusive method).
183pub fn quantile(data: &[f64], q: f64) -> Result<f64> {
184    if data.is_empty() {
185        return Err(CyaneaError::InvalidInput(
186            "quantile: data must not be empty".into(),
187        ));
188    }
189    if !(0.0..=1.0).contains(&q) {
190        return Err(CyaneaError::InvalidInput(
191            "quantile: q must be in [0, 1]".into(),
192        ));
193    }
194    let mut sorted = data.to_vec();
195    sorted.sort_by(|a, b| a.total_cmp(b));
196    Ok(compute_quantile_sorted(&sorted, q))
197}
198
199/// Interquartile range (Q3 - Q1).
200pub fn iqr(data: &[f64]) -> Result<f64> {
201    let q1 = quantile(data, 0.25)?;
202    let q3 = quantile(data, 0.75)?;
203    Ok(q3 - q1)
204}
205
206/// Median absolute deviation (MAD).
207pub fn mad(data: &[f64]) -> Result<f64> {
208    let med = median(data)?;
209    let deviations: Vec<f64> = data.iter().map(|&x| (x - med).abs()).collect();
210    median(&deviations)
211}
212
213// ── Internal ───────────────────────────────────────────────────────────────
214
215/// Compute a quantile from a pre-sorted slice using linear interpolation.
216fn compute_quantile_sorted(sorted: &[f64], q: f64) -> f64 {
217    let n = sorted.len();
218    if n == 1 {
219        return sorted[0];
220    }
221    let pos = q * (n - 1) as f64;
222    let lo = pos.floor() as usize;
223    let hi = lo + 1;
224    let frac = pos - lo as f64;
225    if hi >= n {
226        sorted[n - 1]
227    } else {
228        sorted[lo] * (1.0 - frac) + sorted[hi] * frac
229    }
230}
231
232// ── Tests ──────────────────────────────────────────────────────────────────
233
234#[cfg(test)]
235mod tests {
236    use super::*;
237
238    const TOL: f64 = 1e-10;
239
240    #[test]
241    fn describe_known_data() {
242        let data = [1.0, 2.0, 3.0, 4.0, 5.0];
243        let stats = describe(&data).unwrap();
244        assert_eq!(stats.count, 5);
245        assert!((stats.mean - 3.0).abs() < TOL);
246        assert!((stats.median - 3.0).abs() < TOL);
247        assert!((stats.min - 1.0).abs() < TOL);
248        assert!((stats.max - 5.0).abs() < TOL);
249        assert!((stats.range - 4.0).abs() < TOL);
250        // Population variance of [1,2,3,4,5] = 2.0
251        assert!((stats.variance - 2.0).abs() < TOL);
252        // Sample variance = 2.5
253        assert!((stats.sample_variance - 2.5).abs() < TOL);
254    }
255
256    #[test]
257    fn describe_single() {
258        let data = [42.0];
259        let stats = describe(&data).unwrap();
260        assert_eq!(stats.count, 1);
261        assert!((stats.mean - 42.0).abs() < TOL);
262        assert!((stats.variance - 0.0).abs() < TOL);
263        assert!(stats.sample_variance.is_nan());
264    }
265
266    #[test]
267    fn describe_empty() {
268        assert!(describe(&[]).is_err());
269    }
270
271    #[test]
272    fn mean_basic() {
273        assert!((mean(&[2.0, 4.0, 6.0]).unwrap() - 4.0).abs() < TOL);
274    }
275
276    #[test]
277    fn mean_empty() {
278        assert!(mean(&[]).is_err());
279    }
280
281    #[test]
282    fn median_odd() {
283        assert!((median(&[3.0, 1.0, 2.0]).unwrap() - 2.0).abs() < TOL);
284    }
285
286    #[test]
287    fn median_even() {
288        assert!((median(&[4.0, 1.0, 3.0, 2.0]).unwrap() - 2.5).abs() < TOL);
289    }
290
291    #[test]
292    fn variance_population() {
293        let data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
294        assert!((variance(&data, 0).unwrap() - 4.0).abs() < TOL);
295    }
296
297    #[test]
298    fn variance_sample() {
299        let data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
300        let expected = 32.0 / 7.0; // ≈ 4.571
301        assert!((variance(&data, 1).unwrap() - expected).abs() < TOL);
302    }
303
304    #[test]
305    fn variance_too_few() {
306        assert!(variance(&[1.0], 1).is_err());
307        assert!(variance(&[], 0).is_err());
308    }
309
310    #[test]
311    fn quantile_basic() {
312        let data = [1.0, 2.0, 3.0, 4.0, 5.0];
313        assert!((quantile(&data, 0.0).unwrap() - 1.0).abs() < TOL);
314        assert!((quantile(&data, 1.0).unwrap() - 5.0).abs() < TOL);
315        assert!((quantile(&data, 0.5).unwrap() - 3.0).abs() < TOL);
316    }
317
318    #[test]
319    fn quantile_invalid_q() {
320        assert!(quantile(&[1.0], -0.1).is_err());
321        assert!(quantile(&[1.0], 1.1).is_err());
322    }
323
324    #[test]
325    fn iqr_basic() {
326        let data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
327        let result = iqr(&data).unwrap();
328        // Q1 = 2.75, Q3 = 6.25, IQR = 3.5
329        assert!((result - 3.5).abs() < TOL);
330    }
331
332    #[test]
333    fn mad_basic() {
334        let data = [1.0, 1.0, 2.0, 2.0, 4.0, 6.0, 9.0];
335        // median = 2, deviations = [1,1,0,0,2,4,7], median of deviations = 1
336        assert!((mad(&data).unwrap() - 1.0).abs() < TOL);
337    }
338
339    #[test]
340    fn describe_skewness_symmetric() {
341        // Symmetric data should have skewness ≈ 0
342        let data = [1.0, 2.0, 3.0, 4.0, 5.0];
343        let stats = describe(&data).unwrap();
344        assert!((stats.skewness).abs() < TOL);
345    }
346
347    #[test]
348    fn summarizable_impl() {
349        let stats = describe(&[1.0, 2.0, 3.0]).unwrap();
350        let s = stats.summary();
351        assert!(s.contains("n=3"));
352        assert!(s.contains("mean="));
353    }
354}
355
356#[cfg(test)]
357mod proptests {
358    use super::*;
359    use proptest::prelude::*;
360
361    fn finite_f64() -> impl Strategy<Value = f64> {
362        -1e12_f64..1e12_f64
363    }
364
365    proptest! {
366        #[test]
367        fn describe_count_equals_length(
368            data in proptest::collection::vec(finite_f64(), 1..=500)
369        ) {
370            let stats = describe(&data).unwrap();
371            prop_assert_eq!(stats.count, data.len());
372        }
373
374        #[test]
375        fn variance_nonnegative(
376            data in proptest::collection::vec(finite_f64(), 1..=500)
377        ) {
378            let stats = describe(&data).unwrap();
379            prop_assert!(stats.variance >= 0.0, "variance={} should be >= 0", stats.variance);
380        }
381
382        #[test]
383        fn min_le_mean_le_max(
384            data in proptest::collection::vec(finite_f64(), 1..=500)
385        ) {
386            let stats = describe(&data).unwrap();
387            prop_assert!(stats.min <= stats.mean + 1e-10,
388                "min={} > mean={}", stats.min, stats.mean);
389            prop_assert!(stats.mean <= stats.max + 1e-10,
390                "mean={} > max={}", stats.mean, stats.max);
391        }
392
393        #[test]
394        fn range_equals_max_minus_min(
395            data in proptest::collection::vec(finite_f64(), 1..=500)
396        ) {
397            let stats = describe(&data).unwrap();
398            prop_assert!((stats.range - (stats.max - stats.min)).abs() < 1e-10);
399        }
400
401        #[test]
402        fn median_between_min_and_max(
403            data in proptest::collection::vec(finite_f64(), 1..=500)
404        ) {
405            let stats = describe(&data).unwrap();
406            prop_assert!(stats.median >= stats.min - 1e-10,
407                "median={} < min={}", stats.median, stats.min);
408            prop_assert!(stats.median <= stats.max + 1e-10,
409                "median={} > max={}", stats.median, stats.max);
410        }
411    }
412}