Skip to main content

ferray_stats/
descriptive.rs

1// ferray-stats: Descriptive statistics — skew, kurtosis, zscore, mode,
2// iqr, sem, gmean, hmean (#470).
3//
4// scipy.stats parity for the descriptive set the issue calls out as
5// "at least these would be expected". Statistical *tests* (ttest,
6// ks_2samp, chi2_contingency, pearsonr, spearmanr) need cumulative
7// distribution functions and are a follow-up.
8
9use ferray_core::error::{FerrayError, FerrayResult};
10use ferray_core::{Array, Dimension, Element, IxDyn};
11use num_traits::Float;
12
13/// Sample skewness (third standardized moment).
14///
15/// Returns `E[((X - μ)/σ)^3]` computed with the population standard
16/// deviation (denominator `n`, matching scipy's default
17/// `bias=True` behavior).
18///
19/// # Errors
20/// `FerrayError::InvalidValue` if the array is empty or constant
21/// (variance is exactly zero).
22pub fn skew<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
23where
24    T: Element + Copy + Into<f64>,
25    D: Dimension,
26{
27    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
28    if data.is_empty() {
29        return Err(FerrayError::invalid_value("skew on empty array"));
30    }
31    let n = data.len() as f64;
32    let mean = data.iter().sum::<f64>() / n;
33    let m2 = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
34    let m3 = data.iter().map(|x| (x - mean).powi(3)).sum::<f64>() / n;
35    if m2 == 0.0 {
36        return Err(FerrayError::invalid_value(
37            "skew is undefined for a constant array (zero variance)",
38        ));
39    }
40    Ok(m3 / m2.powf(1.5))
41}
42
43/// Sample kurtosis.
44///
45/// If `fisher` is `true` (default in scipy), returns excess kurtosis
46/// (subtracts 3 so a normal distribution has kurtosis 0). Otherwise
47/// returns Pearson kurtosis (`E[((X - μ)/σ)^4]`, normal = 3).
48///
49/// Computed with the population standard deviation (denominator `n`,
50/// `bias=True` in scipy).
51///
52/// # Errors
53/// `FerrayError::InvalidValue` if the array is empty or constant.
54pub fn kurtosis<T, D>(a: &Array<T, D>, fisher: bool) -> FerrayResult<f64>
55where
56    T: Element + Copy + Into<f64>,
57    D: Dimension,
58{
59    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
60    if data.is_empty() {
61        return Err(FerrayError::invalid_value("kurtosis on empty array"));
62    }
63    let n = data.len() as f64;
64    let mean = data.iter().sum::<f64>() / n;
65    let m2 = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
66    let m4 = data.iter().map(|x| (x - mean).powi(4)).sum::<f64>() / n;
67    if m2 == 0.0 {
68        return Err(FerrayError::invalid_value(
69            "kurtosis is undefined for a constant array (zero variance)",
70        ));
71    }
72    let pearson = m4 / (m2 * m2);
73    Ok(if fisher { pearson - 3.0 } else { pearson })
74}
75
76/// Per-element z-score: `(x - mean) / std`.
77///
78/// Uses the population standard deviation (denominator `n`, scipy's
79/// `ddof=0` default). Returns an array with the same shape as the
80/// input.
81///
82/// # Errors
83/// `FerrayError::InvalidValue` if the array is empty or constant.
84pub fn zscore<T, D>(a: &Array<T, D>) -> FerrayResult<Array<f64, IxDyn>>
85where
86    T: Element + Copy + Into<f64>,
87    D: Dimension,
88{
89    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
90    if data.is_empty() {
91        return Err(FerrayError::invalid_value("zscore on empty array"));
92    }
93    let n = data.len() as f64;
94    let mean = data.iter().sum::<f64>() / n;
95    let var = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
96    if var == 0.0 {
97        return Err(FerrayError::invalid_value(
98            "zscore is undefined for a constant array (zero variance)",
99        ));
100    }
101    let std = var.sqrt();
102    let out: Vec<f64> = data.iter().map(|x| (x - mean) / std).collect();
103    let shape: Vec<usize> = a.shape().to_vec();
104    Array::<f64, IxDyn>::from_vec(IxDyn::new(&shape), out)
105}
106
107/// Result of [`mode`]: the most-frequent value and its count.
108#[derive(Debug, Clone, Copy)]
109pub struct ModeResult<T> {
110    /// The most frequent value (smallest one wins on ties).
111    pub value: T,
112    /// Number of occurrences in the input.
113    pub count: u64,
114}
115
116/// Most-frequent value in an array (mode).
117///
118/// On ties, returns the smallest value (matches scipy's default).
119/// Compares values via `partial_cmp`; types like `f64` work but NaN
120/// elements are ignored.
121///
122/// # Errors
123/// `FerrayError::InvalidValue` if the array is empty or contains
124/// only NaN.
125pub fn mode<T, D>(a: &Array<T, D>) -> FerrayResult<ModeResult<T>>
126where
127    T: Element + Copy + PartialOrd,
128    D: Dimension,
129{
130    if a.is_empty() {
131        return Err(FerrayError::invalid_value("mode on empty array"));
132    }
133    // Sort (excluding NaN-like values that fail partial_cmp). After
134    // sorting, ties are run-length encoded: count consecutive equal
135    // elements, track the longest run; tiebreak by the smaller value.
136    let mut data: Vec<T> = a
137        .iter()
138        .copied()
139        .filter(|v| v.partial_cmp(v).is_some())
140        .collect();
141    if data.is_empty() {
142        return Err(FerrayError::invalid_value(
143            "mode on array with no comparable values",
144        ));
145    }
146    data.sort_by(|x, y| x.partial_cmp(y).unwrap_or(std::cmp::Ordering::Equal));
147
148    let mut best_val = data[0];
149    let mut best_count: u64 = 1;
150    let mut cur_val = data[0];
151    let mut cur_count: u64 = 1;
152    for &v in &data[1..] {
153        if v.partial_cmp(&cur_val) == Some(std::cmp::Ordering::Equal) {
154            cur_count += 1;
155        } else {
156            if cur_count > best_count {
157                best_count = cur_count;
158                best_val = cur_val;
159            }
160            cur_val = v;
161            cur_count = 1;
162        }
163    }
164    if cur_count > best_count {
165        best_count = cur_count;
166        best_val = cur_val;
167    }
168    Ok(ModeResult {
169        value: best_val,
170        count: best_count,
171    })
172}
173
174/// Interquartile range: Q75 − Q25.
175///
176/// Equivalent to `scipy.stats.iqr` with the default linear
177/// interpolation method.
178///
179/// # Errors
180/// `FerrayError::InvalidValue` if the array is empty.
181pub fn iqr<T, D>(a: &Array<T, D>) -> FerrayResult<T>
182where
183    T: Element + Float,
184    D: Dimension,
185{
186    let q75 = crate::reductions::quantile::quantile(
187        a,
188        T::from(0.75).expect("0.75 must round-trip to T"),
189        None,
190    )?;
191    let q25 = crate::reductions::quantile::quantile(
192        a,
193        T::from(0.25).expect("0.25 must round-trip to T"),
194        None,
195    )?;
196    let q75v = *q75.as_slice().unwrap().first().unwrap();
197    let q25v = *q25.as_slice().unwrap().first().unwrap();
198    Ok(q75v - q25v)
199}
200
201/// Standard error of the mean: `std / sqrt(n)`.
202///
203/// Uses the sample standard deviation (denominator `n - 1`,
204/// matching scipy's `ddof=1` default).
205///
206/// # Errors
207/// `FerrayError::InvalidValue` if the array has fewer than 2
208/// elements.
209pub fn sem<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
210where
211    T: Element + Copy + Into<f64>,
212    D: Dimension,
213{
214    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
215    let n = data.len();
216    if n < 2 {
217        return Err(FerrayError::invalid_value(
218            "sem requires at least 2 elements",
219        ));
220    }
221    let nf = n as f64;
222    let mean = data.iter().sum::<f64>() / nf;
223    let var = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (nf - 1.0);
224    Ok((var / nf).sqrt())
225}
226
227/// Geometric mean: `(prod x_i) ^ (1/n)`.
228///
229/// Uses log-space to avoid overflow on large `n`. All elements must
230/// be strictly positive.
231///
232/// # Errors
233/// `FerrayError::InvalidValue` if the array is empty or contains a
234/// non-positive value.
235pub fn gmean<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
236where
237    T: Element + Copy + Into<f64>,
238    D: Dimension,
239{
240    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
241    if data.is_empty() {
242        return Err(FerrayError::invalid_value("gmean on empty array"));
243    }
244    let mut log_sum = 0.0_f64;
245    for &x in &data {
246        if x <= 0.0 {
247            return Err(FerrayError::invalid_value(
248                "gmean requires all elements > 0",
249            ));
250        }
251        log_sum += x.ln();
252    }
253    Ok((log_sum / data.len() as f64).exp())
254}
255
256/// Harmonic mean: `n / sum(1/x_i)`.
257///
258/// All elements must be strictly positive.
259///
260/// # Errors
261/// `FerrayError::InvalidValue` if the array is empty or contains a
262/// non-positive value.
263pub fn hmean<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
264where
265    T: Element + Copy + Into<f64>,
266    D: Dimension,
267{
268    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
269    if data.is_empty() {
270        return Err(FerrayError::invalid_value("hmean on empty array"));
271    }
272    let mut recip_sum = 0.0_f64;
273    for &x in &data {
274        if x <= 0.0 {
275            return Err(FerrayError::invalid_value(
276                "hmean requires all elements > 0",
277            ));
278        }
279        recip_sum += 1.0 / x;
280    }
281    Ok(data.len() as f64 / recip_sum)
282}
283
284#[cfg(test)]
285mod tests {
286    use super::*;
287    use ferray_core::{Ix1, Ix2};
288
289    #[test]
290    fn skew_symmetric_zero() {
291        let a =
292            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
293        assert!(skew(&a).unwrap().abs() < 1e-12);
294    }
295
296    #[test]
297    fn skew_right_skewed_positive() {
298        let a = Array::<f64, Ix1>::from_vec(Ix1::new([6]), vec![1.0, 1.0, 1.0, 1.0, 1.0, 10.0])
299            .unwrap();
300        assert!(skew(&a).unwrap() > 0.0);
301    }
302
303    #[test]
304    fn kurtosis_normal_fisher_near_zero() {
305        // Symmetric narrow distribution: kurtosis (Fisher) close to 0
306        // is too strong a guarantee for n=5, but Fisher version of an
307        // arithmetic progression around 0 is a fixed value (-1.3) so
308        // we check for the known answer.
309        let a =
310            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
311        let k = kurtosis(&a, true).unwrap();
312        assert!((k - (-1.3)).abs() < 1e-12, "Fisher kurtosis = {k}");
313    }
314
315    #[test]
316    fn kurtosis_pearson_is_fisher_plus_3() {
317        let a =
318            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
319        let f = kurtosis(&a, true).unwrap();
320        let p = kurtosis(&a, false).unwrap();
321        assert!((p - (f + 3.0)).abs() < 1e-12);
322    }
323
324    #[test]
325    fn zscore_has_mean_zero_std_one() {
326        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
327        let z = zscore(&a).unwrap();
328        let s = z.as_slice().unwrap();
329        let n = s.len() as f64;
330        let mean: f64 = s.iter().sum::<f64>() / n;
331        let std = (s.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n).sqrt();
332        assert!(mean.abs() < 1e-12);
333        assert!((std - 1.0).abs() < 1e-12);
334    }
335
336    #[test]
337    fn zscore_constant_errors() {
338        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![3.0; 4]).unwrap();
339        assert!(zscore(&a).is_err());
340    }
341
342    #[test]
343    fn zscore_2d_preserves_shape() {
344        let a = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
345            .unwrap();
346        let z = zscore(&a).unwrap();
347        assert_eq!(z.shape(), &[2, 3]);
348    }
349
350    #[test]
351    fn mode_picks_most_frequent() {
352        let a = Array::<i32, Ix1>::from_vec(Ix1::new([7]), vec![1, 2, 2, 3, 3, 3, 4]).unwrap();
353        let m = mode(&a).unwrap();
354        assert_eq!(m.value, 3);
355        assert_eq!(m.count, 3);
356    }
357
358    #[test]
359    fn mode_tiebreak_smallest_wins() {
360        let a = Array::<i32, Ix1>::from_vec(Ix1::new([6]), vec![5, 5, 7, 7, 1, 1]).unwrap();
361        let m = mode(&a).unwrap();
362        assert_eq!(m.value, 1);
363        assert_eq!(m.count, 2);
364    }
365
366    #[test]
367    fn iqr_50_percent_span() {
368        // Data 1..9: Q25 = 2.75, Q75 = 6.25 → IQR = 3.5 (numpy default).
369        let a = Array::<f64, Ix1>::from_vec(
370            Ix1::new([8]),
371            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
372        )
373        .unwrap();
374        let v = iqr(&a).unwrap();
375        assert!((v - 3.5).abs() < 1e-12, "IQR = {v}");
376    }
377
378    #[test]
379    fn sem_constant_data_is_zero() {
380        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![3.0; 5]).unwrap();
381        assert!(sem(&a).unwrap().abs() < 1e-12);
382    }
383
384    #[test]
385    fn sem_known_value() {
386        // [1, 2, 3, 4, 5]: mean=3, ddof=1 var = 10/4 = 2.5,
387        // std = sqrt(2.5), sem = std / sqrt(5) ≈ 0.7071...
388        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
389        let s = sem(&a).unwrap();
390        let want = (2.5_f64 / 5.0).sqrt();
391        assert!((s - want).abs() < 1e-12, "sem = {s}, want {want}");
392    }
393
394    #[test]
395    fn gmean_known_value() {
396        // gmean(1, 2, 4, 8) = (1*2*4*8)^(1/4) = 64^0.25 = 2*sqrt(2)
397        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![1.0, 2.0, 4.0, 8.0]).unwrap();
398        let g = gmean(&a).unwrap();
399        let want = 2.0_f64.sqrt() * 2.0;
400        assert!((g - want).abs() < 1e-12, "gmean = {g}, want {want}");
401    }
402
403    #[test]
404    fn gmean_rejects_nonpositive() {
405        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 0.0, 4.0]).unwrap();
406        assert!(gmean(&a).is_err());
407    }
408
409    #[test]
410    fn hmean_known_value() {
411        // hmean(1, 2, 4) = 3 / (1 + 0.5 + 0.25) = 3 / 1.75 = 12/7
412        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 4.0]).unwrap();
413        let h = hmean(&a).unwrap();
414        assert!((h - 12.0 / 7.0).abs() < 1e-12, "hmean = {h}");
415    }
416
417    #[test]
418    fn hmean_rejects_nonpositive() {
419        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, -1.0, 2.0]).unwrap();
420        assert!(hmean(&a).is_err());
421    }
422
423    #[test]
424    fn empty_array_errors_across_helpers() {
425        let a = Array::<f64, Ix1>::from_vec(Ix1::new([0]), Vec::new()).unwrap();
426        assert!(skew(&a).is_err());
427        assert!(kurtosis(&a, true).is_err());
428        assert!(zscore(&a).is_err());
429        assert!(mode(&a).is_err());
430        assert!(gmean(&a).is_err());
431        assert!(hmean(&a).is_err());
432        assert!(sem(&a).is_err());
433    }
434}