Skip to main content

numra_stats/
descriptive.rs

1//! Descriptive statistics: mean, variance, standard deviation, percentiles, etc.
2//!
3//! Author: Moussa Leblouba
4//! Date: 9 February 2026
5//! Modified: 2 May 2026
6
7use numra_core::Scalar;
8
9use crate::error::StatsError;
10
11/// Arithmetic mean.
12pub fn mean<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
13    if data.is_empty() {
14        return Err(StatsError::EmptyData);
15    }
16    let n = S::from_usize(data.len());
17    let sum: S = data.iter().copied().fold(S::ZERO, |a, b| a + b);
18    Ok(sum / n)
19}
20
21/// Sample variance (with Bessel's correction, divides by N-1).
22pub fn variance<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
23    if data.len() < 2 {
24        return Err(StatsError::EmptyData);
25    }
26    let m = mean(data)?;
27    let n = S::from_usize(data.len());
28    let sum_sq: S = data
29        .iter()
30        .copied()
31        .fold(S::ZERO, |a, x| a + (x - m) * (x - m));
32    Ok(sum_sq / (n - S::ONE))
33}
34
35/// Sample standard deviation.
36pub fn std_dev<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
37    Ok(variance(data)?.sqrt())
38}
39
40/// Median (middle value of sorted data).
41pub fn median<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
42    if data.is_empty() {
43        return Err(StatsError::EmptyData);
44    }
45    let mut sorted: Vec<S> = data.to_vec();
46    sorted.sort_by(|a, b| a.to_f64().partial_cmp(&b.to_f64()).unwrap());
47    let n = sorted.len();
48    if n % 2 == 1 {
49        Ok(sorted[n / 2])
50    } else {
51        let half = S::HALF;
52        Ok((sorted[n / 2 - 1] + sorted[n / 2]) * half)
53    }
54}
55
56/// Percentile (p in [0, 100]) using linear interpolation.
57pub fn percentile<S: Scalar>(data: &[S], p: S) -> Result<S, StatsError> {
58    if data.is_empty() {
59        return Err(StatsError::EmptyData);
60    }
61    let p_f64 = p.to_f64();
62    if !(0.0..=100.0).contains(&p_f64) {
63        return Err(StatsError::InvalidParameter(
64            "percentile must be in [0, 100]".into(),
65        ));
66    }
67    let mut sorted: Vec<S> = data.to_vec();
68    sorted.sort_by(|a, b| a.to_f64().partial_cmp(&b.to_f64()).unwrap());
69    let n = sorted.len();
70    if n == 1 {
71        return Ok(sorted[0]);
72    }
73    // Linear interpolation
74    let rank = p_f64 / 100.0 * (n - 1) as f64;
75    let lo = rank.floor() as usize;
76    let hi = rank.ceil() as usize;
77    if lo == hi {
78        Ok(sorted[lo])
79    } else {
80        let frac = S::from_f64(rank - lo as f64);
81        Ok(sorted[lo] + frac * (sorted[hi] - sorted[lo]))
82    }
83}
84
85/// Skewness (Fisher's definition, adjusted for sample bias).
86pub fn skewness<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
87    if data.len() < 3 {
88        return Err(StatsError::EmptyData);
89    }
90    let m = mean(data)?;
91    let n = data.len() as f64;
92    let s = std_dev(data)?;
93    let m3: S = data
94        .iter()
95        .copied()
96        .fold(S::ZERO, |a, x| a + (x - m).powi(3));
97    let m3_avg = m3 / S::from_f64(n);
98    let s3 = s * s * s;
99    // Adjusted skewness: n / ((n-1)(n-2)) * sum((x-mean)^3) / s^3
100    let adjust = n * n / ((n - 1.0) * (n - 2.0));
101    Ok(m3_avg / s3 * S::from_f64(adjust))
102}
103
104/// Excess kurtosis (Fisher's definition).
105pub fn kurtosis<S: Scalar>(data: &[S]) -> Result<S, StatsError> {
106    if data.len() < 4 {
107        return Err(StatsError::EmptyData);
108    }
109    let m = mean(data)?;
110    let n = data.len() as f64;
111    let v = variance(data)?;
112    let m4: S = data
113        .iter()
114        .copied()
115        .fold(S::ZERO, |a, x| a + (x - m).powi(4));
116    let m4_avg = m4 / S::from_f64(n);
117    let v2 = v * v;
118    // Adjusted excess kurtosis
119    let term1 = n * (n + 1.0) / ((n - 1.0) * (n - 2.0) * (n - 3.0));
120    let term2 = 3.0 * (n - 1.0) * (n - 1.0) / ((n - 2.0) * (n - 3.0));
121    // Using sample moments: n * sum((x-mean)^4) / (sum((x-mean)^2)^2) - 3 (plus bias correction)
122    Ok(S::from_f64(term1) * m4_avg * S::from_f64(n) / v2 - S::from_f64(term2))
123}
124
125/// Sample covariance of two data sets.
126pub fn covariance<S: Scalar>(x: &[S], y: &[S]) -> Result<S, StatsError> {
127    if x.is_empty() {
128        return Err(StatsError::EmptyData);
129    }
130    if x.len() != y.len() {
131        return Err(StatsError::LengthMismatch {
132            expected: x.len(),
133            got: y.len(),
134        });
135    }
136    if x.len() < 2 {
137        return Err(StatsError::EmptyData);
138    }
139    let mx = mean(x)?;
140    let my = mean(y)?;
141    let n = S::from_usize(x.len());
142    let sum: S = x
143        .iter()
144        .zip(y.iter())
145        .fold(S::ZERO, |a, (&xi, &yi)| a + (xi - mx) * (yi - my));
146    Ok(sum / (n - S::ONE))
147}
148
149/// Covariance matrix for p variables, each with n observations.
150///
151/// `data` is a slice of p vectors, each of length n.
152/// Returns a row-major p x p matrix.
153pub fn covariance_matrix<S: Scalar>(data: &[Vec<S>]) -> Result<Vec<S>, StatsError> {
154    if data.is_empty() {
155        return Err(StatsError::EmptyData);
156    }
157    let p = data.len();
158    let n = data[0].len();
159    for v in data {
160        if v.len() != n {
161            return Err(StatsError::LengthMismatch {
162                expected: n,
163                got: v.len(),
164            });
165        }
166    }
167    if n < 2 {
168        return Err(StatsError::EmptyData);
169    }
170
171    let mut cov = vec![S::ZERO; p * p];
172    for i in 0..p {
173        for j in i..p {
174            let c = covariance(&data[i], &data[j])?;
175            cov[i * p + j] = c;
176            cov[j * p + i] = c;
177        }
178    }
179    Ok(cov)
180}
181
182#[cfg(test)]
183mod tests {
184    use super::*;
185
186    #[test]
187    fn test_mean() {
188        let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
189        assert!((mean(&data).unwrap() - 3.0).abs() < 1e-14);
190    }
191
192    #[test]
193    fn test_variance() {
194        let data = vec![2.0_f64, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
195        assert!((variance(&data).unwrap() - 4.571428571428571).abs() < 1e-10);
196    }
197
198    #[test]
199    fn test_std_dev() {
200        let data = vec![2.0_f64, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
201        let s = std_dev(&data).unwrap();
202        assert!((s * s - variance(&data).unwrap()).abs() < 1e-12);
203    }
204
205    #[test]
206    fn test_median_odd() {
207        let data = vec![3.0_f64, 1.0, 2.0];
208        assert!((median(&data).unwrap() - 2.0).abs() < 1e-14);
209    }
210
211    #[test]
212    fn test_median_even() {
213        let data = vec![1.0_f64, 2.0, 3.0, 4.0];
214        assert!((median(&data).unwrap() - 2.5).abs() < 1e-14);
215    }
216
217    #[test]
218    fn test_percentile() {
219        let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
220        assert!((percentile(&data, 0.0).unwrap() - 1.0).abs() < 1e-14);
221        assert!((percentile(&data, 50.0).unwrap() - 3.0).abs() < 1e-14);
222        assert!((percentile(&data, 100.0).unwrap() - 5.0).abs() < 1e-14);
223    }
224
225    #[test]
226    fn test_skewness_symmetric() {
227        // Symmetric distribution should have ~0 skewness
228        let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
229        assert!(skewness(&data).unwrap().abs() < 1e-10);
230    }
231
232    #[test]
233    fn test_kurtosis_normal_like() {
234        // For a uniform-ish distribution, kurtosis should be negative
235        let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
236        let k = kurtosis(&data).unwrap();
237        assert!(k < 0.0, "kurtosis = {}", k); // Uniform has excess kurtosis = -1.2
238    }
239
240    #[test]
241    fn test_covariance_self() {
242        let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
243        let cov = covariance(&data, &data).unwrap();
244        let var = variance(&data).unwrap();
245        assert!((cov - var).abs() < 1e-12);
246    }
247
248    #[test]
249    fn test_covariance_matrix_diagonal() {
250        let x = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
251        let y = vec![5.0_f64, 4.0, 3.0, 2.0, 1.0];
252        let cov = covariance_matrix(&[x.clone(), y.clone()]).unwrap();
253        assert_eq!(cov.len(), 4);
254        assert!((cov[0] - variance(&x).unwrap()).abs() < 1e-12);
255        assert!((cov[3] - variance(&y).unwrap()).abs() < 1e-12);
256        // Off-diagonal should be negative (inversely correlated)
257        assert!(cov[1] < 0.0);
258        assert!((cov[1] - cov[2]).abs() < 1e-12); // symmetric
259    }
260
261    #[test]
262    fn test_empty_data() {
263        assert!(mean::<f64>(&[]).is_err());
264        assert!(variance::<f64>(&[]).is_err());
265        assert!(median::<f64>(&[]).is_err());
266    }
267}