Skip to main content

ferray_stats/
descriptive.rs

1// ferray-stats: Descriptive statistics — skew, kurtosis, zscore, mode,
2// iqr, sem, gmean, hmean (#470).
3//
4// scipy.stats parity for the descriptive set the issue calls out as
5// "at least these would be expected". Statistical *tests* (ttest,
6// ks_2samp, chi2_contingency, pearsonr, spearmanr) need cumulative
7// distribution functions and are a follow-up.
8//
9// ## REQ status (ferray-stats descriptive, scipy.stats parity)
10// This module predates the numbered REQ-1..REQ-21 of `.design/ferray-stats.md`
11// (which cover the numpy reduction/sort/search/set surface); the descriptive
12// set is tracked as the scipy.stats parity issue #470. Status keyed by symbol:
13//  - skew / kurtosis (#470) — SHIPPED: `pub fn skew` (third standardized
14//    moment, `bias=True`/population denominator) and `pub fn kurtosis` (fourth
15//    standardized moment, `fisher` toggle), matching scipy's `skew`/`kurtosis`
16//    defaults. Consumers: re-exported as the crate's public API via
17//    `ferray-stats/src/lib.rs` (`pub use descriptive::{… kurtosis, … skew}`).
18//  - zscore (#470) — SHIPPED: `pub fn zscore` returns `(x - μ)/σ` as
19//    `Array<f64, IxDyn>`. Consumer: the `lib.rs` `pub use` re-export.
20//  - mode (#470) — SHIPPED: `pub fn mode` returns `ModeResult<T>` (smallest
21//    most-frequent value + count), matching scipy's tie rule. Consumer: the
22//    `lib.rs` `pub use` re-export (`ModeResult`, `mode`).
23//  - iqr (#470) — SHIPPED: `pub fn iqr` = `quantile(.75) - quantile(.25)`;
24//    it is itself a non-test consumer of `crate::reductions::quantile::quantile`.
25//    Consumer of `iqr`: the `lib.rs` `pub use` re-export.
26//  - sem / gmean / hmean (#470) — SHIPPED: `pub fn sem` (standard error of the
27//    mean, `ddof=1`), `pub fn gmean` (geometric mean), `pub fn hmean`
28//    (harmonic mean), matching scipy. Consumers: the `lib.rs` `pub use`
29//    re-export (`gmean`, `hmean`, `sem`).
30//  - statistical hypothesis tests (ttest / ks_2samp / chi2_contingency /
31//    pearsonr / spearmanr) — NOT-STARTED: deferred follow-up (they need CDF
32//    machinery; see the module note above). No `pub fn` here yet.
33
34use ferray_core::error::{FerrayError, FerrayResult};
35use ferray_core::{Array, Dimension, Element, IxDyn};
36use num_traits::Float;
37
38/// Sample skewness (third standardized moment).
39///
40/// Returns `E[((X - μ)/σ)^3]` computed with the population standard
41/// deviation (denominator `n`, matching scipy's default
42/// `bias=True` behavior).
43///
44/// # Errors
45/// `FerrayError::InvalidValue` if the array is empty or constant
46/// (variance is exactly zero).
47pub fn skew<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
48where
49    T: Element + Copy + Into<f64>,
50    D: Dimension,
51{
52    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
53    if data.is_empty() {
54        return Err(FerrayError::invalid_value("skew on empty array"));
55    }
56    let n = data.len() as f64;
57    let mean = data.iter().sum::<f64>() / n;
58    let m2 = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
59    let m3 = data.iter().map(|x| (x - mean).powi(3)).sum::<f64>() / n;
60    if m2 == 0.0 {
61        return Err(FerrayError::invalid_value(
62            "skew is undefined for a constant array (zero variance)",
63        ));
64    }
65    Ok(m3 / m2.powf(1.5))
66}
67
68/// Sample kurtosis.
69///
70/// If `fisher` is `true` (default in scipy), returns excess kurtosis
71/// (subtracts 3 so a normal distribution has kurtosis 0). Otherwise
72/// returns Pearson kurtosis (`E[((X - μ)/σ)^4]`, normal = 3).
73///
74/// Computed with the population standard deviation (denominator `n`,
75/// `bias=True` in scipy).
76///
77/// # Errors
78/// `FerrayError::InvalidValue` if the array is empty or constant.
79pub fn kurtosis<T, D>(a: &Array<T, D>, fisher: bool) -> FerrayResult<f64>
80where
81    T: Element + Copy + Into<f64>,
82    D: Dimension,
83{
84    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
85    if data.is_empty() {
86        return Err(FerrayError::invalid_value("kurtosis on empty array"));
87    }
88    let n = data.len() as f64;
89    let mean = data.iter().sum::<f64>() / n;
90    let m2 = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
91    let m4 = data.iter().map(|x| (x - mean).powi(4)).sum::<f64>() / n;
92    if m2 == 0.0 {
93        return Err(FerrayError::invalid_value(
94            "kurtosis is undefined for a constant array (zero variance)",
95        ));
96    }
97    let pearson = m4 / (m2 * m2);
98    Ok(if fisher { pearson - 3.0 } else { pearson })
99}
100
101/// Per-element z-score: `(x - mean) / std`.
102///
103/// Uses the population standard deviation (denominator `n`, scipy's
104/// `ddof=0` default). Returns an array with the same shape as the
105/// input.
106///
107/// # Errors
108/// `FerrayError::InvalidValue` if the array is empty or constant.
109pub fn zscore<T, D>(a: &Array<T, D>) -> FerrayResult<Array<f64, IxDyn>>
110where
111    T: Element + Copy + Into<f64>,
112    D: Dimension,
113{
114    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
115    if data.is_empty() {
116        return Err(FerrayError::invalid_value("zscore on empty array"));
117    }
118    let n = data.len() as f64;
119    let mean = data.iter().sum::<f64>() / n;
120    let var = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
121    if var == 0.0 {
122        return Err(FerrayError::invalid_value(
123            "zscore is undefined for a constant array (zero variance)",
124        ));
125    }
126    let std = var.sqrt();
127    let out: Vec<f64> = data.iter().map(|x| (x - mean) / std).collect();
128    let shape: Vec<usize> = a.shape().to_vec();
129    Array::<f64, IxDyn>::from_vec(IxDyn::new(&shape), out)
130}
131
132/// Result of [`mode`]: the most-frequent value and its count.
133#[derive(Debug, Clone, Copy)]
134pub struct ModeResult<T> {
135    /// The most frequent value (smallest one wins on ties).
136    pub value: T,
137    /// Number of occurrences in the input.
138    pub count: u64,
139}
140
141/// Most-frequent value in an array (mode).
142///
143/// On ties, returns the smallest value (matches scipy's default).
144/// Compares values via `partial_cmp`; types like `f64` work but NaN
145/// elements are ignored.
146///
147/// # Errors
148/// `FerrayError::InvalidValue` if the array is empty or contains
149/// only NaN.
150pub fn mode<T, D>(a: &Array<T, D>) -> FerrayResult<ModeResult<T>>
151where
152    T: Element + Copy + PartialOrd,
153    D: Dimension,
154{
155    if a.is_empty() {
156        return Err(FerrayError::invalid_value("mode on empty array"));
157    }
158    // Sort (excluding NaN-like values that fail partial_cmp). After
159    // sorting, ties are run-length encoded: count consecutive equal
160    // elements, track the longest run; tiebreak by the smaller value.
161    let mut data: Vec<T> = a
162        .iter()
163        .copied()
164        .filter(|v| v.partial_cmp(v).is_some())
165        .collect();
166    if data.is_empty() {
167        return Err(FerrayError::invalid_value(
168            "mode on array with no comparable values",
169        ));
170    }
171    data.sort_by(|x, y| x.partial_cmp(y).unwrap_or(std::cmp::Ordering::Equal));
172
173    let mut best_val = data[0];
174    let mut best_count: u64 = 1;
175    let mut cur_val = data[0];
176    let mut cur_count: u64 = 1;
177    for &v in &data[1..] {
178        if v.partial_cmp(&cur_val) == Some(std::cmp::Ordering::Equal) {
179            cur_count += 1;
180        } else {
181            if cur_count > best_count {
182                best_count = cur_count;
183                best_val = cur_val;
184            }
185            cur_val = v;
186            cur_count = 1;
187        }
188    }
189    if cur_count > best_count {
190        best_count = cur_count;
191        best_val = cur_val;
192    }
193    Ok(ModeResult {
194        value: best_val,
195        count: best_count,
196    })
197}
198
199/// Interquartile range: Q75 − Q25.
200///
201/// Equivalent to `scipy.stats.iqr` with the default linear
202/// interpolation method.
203///
204/// # Errors
205/// `FerrayError::InvalidValue` if the array is empty.
206pub fn iqr<T, D>(a: &Array<T, D>) -> FerrayResult<T>
207where
208    T: Element + Float,
209    D: Dimension,
210{
211    let q75 = crate::reductions::quantile::quantile(
212        a,
213        T::from(0.75).expect("0.75 must round-trip to T"),
214        None,
215    )?;
216    let q25 = crate::reductions::quantile::quantile(
217        a,
218        T::from(0.25).expect("0.25 must round-trip to T"),
219        None,
220    )?;
221    let q75v = *q75.as_slice().unwrap().first().unwrap();
222    let q25v = *q25.as_slice().unwrap().first().unwrap();
223    Ok(q75v - q25v)
224}
225
226/// Standard error of the mean: `std / sqrt(n)`.
227///
228/// Uses the sample standard deviation (denominator `n - 1`,
229/// matching scipy's `ddof=1` default).
230///
231/// # Errors
232/// `FerrayError::InvalidValue` if the array has fewer than 2
233/// elements.
234pub fn sem<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
235where
236    T: Element + Copy + Into<f64>,
237    D: Dimension,
238{
239    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
240    let n = data.len();
241    if n < 2 {
242        return Err(FerrayError::invalid_value(
243            "sem requires at least 2 elements",
244        ));
245    }
246    let nf = n as f64;
247    let mean = data.iter().sum::<f64>() / nf;
248    let var = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (nf - 1.0);
249    Ok((var / nf).sqrt())
250}
251
252/// Geometric mean: `(prod x_i) ^ (1/n)`.
253///
254/// Uses log-space to avoid overflow on large `n`. All elements must
255/// be strictly positive.
256///
257/// # Errors
258/// `FerrayError::InvalidValue` if the array is empty or contains a
259/// non-positive value.
260pub fn gmean<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
261where
262    T: Element + Copy + Into<f64>,
263    D: Dimension,
264{
265    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
266    if data.is_empty() {
267        return Err(FerrayError::invalid_value("gmean on empty array"));
268    }
269    let mut log_sum = 0.0_f64;
270    for &x in &data {
271        if x <= 0.0 {
272            return Err(FerrayError::invalid_value(
273                "gmean requires all elements > 0",
274            ));
275        }
276        log_sum += x.ln();
277    }
278    Ok((log_sum / data.len() as f64).exp())
279}
280
281/// Harmonic mean: `n / sum(1/x_i)`.
282///
283/// All elements must be strictly positive.
284///
285/// # Errors
286/// `FerrayError::InvalidValue` if the array is empty or contains a
287/// non-positive value.
288pub fn hmean<T, D>(a: &Array<T, D>) -> FerrayResult<f64>
289where
290    T: Element + Copy + Into<f64>,
291    D: Dimension,
292{
293    let data: Vec<f64> = a.iter().copied().map(Into::into).collect();
294    if data.is_empty() {
295        return Err(FerrayError::invalid_value("hmean on empty array"));
296    }
297    let mut recip_sum = 0.0_f64;
298    for &x in &data {
299        if x <= 0.0 {
300            return Err(FerrayError::invalid_value(
301                "hmean requires all elements > 0",
302            ));
303        }
304        recip_sum += 1.0 / x;
305    }
306    Ok(data.len() as f64 / recip_sum)
307}
308
309#[cfg(test)]
310mod tests {
311    use super::*;
312    use ferray_core::{Ix1, Ix2};
313
314    #[test]
315    fn skew_symmetric_zero() {
316        let a =
317            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
318        assert!(skew(&a).unwrap().abs() < 1e-12);
319    }
320
321    #[test]
322    fn skew_right_skewed_positive() {
323        let a = Array::<f64, Ix1>::from_vec(Ix1::new([6]), vec![1.0, 1.0, 1.0, 1.0, 1.0, 10.0])
324            .unwrap();
325        assert!(skew(&a).unwrap() > 0.0);
326    }
327
328    #[test]
329    fn kurtosis_normal_fisher_near_zero() {
330        // Symmetric narrow distribution: kurtosis (Fisher) close to 0
331        // is too strong a guarantee for n=5, but Fisher version of an
332        // arithmetic progression around 0 is a fixed value (-1.3) so
333        // we check for the known answer.
334        let a =
335            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
336        let k = kurtosis(&a, true).unwrap();
337        assert!((k - (-1.3)).abs() < 1e-12, "Fisher kurtosis = {k}");
338    }
339
340    #[test]
341    fn kurtosis_pearson_is_fisher_plus_3() {
342        let a =
343            Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![-2.0, -1.0, 0.0, 1.0, 2.0]).unwrap();
344        let f = kurtosis(&a, true).unwrap();
345        let p = kurtosis(&a, false).unwrap();
346        assert!((p - (f + 3.0)).abs() < 1e-12);
347    }
348
349    #[test]
350    fn zscore_has_mean_zero_std_one() {
351        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
352        let z = zscore(&a).unwrap();
353        let s = z.as_slice().unwrap();
354        let n = s.len() as f64;
355        let mean: f64 = s.iter().sum::<f64>() / n;
356        let std = (s.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n).sqrt();
357        assert!(mean.abs() < 1e-12);
358        assert!((std - 1.0).abs() < 1e-12);
359    }
360
361    #[test]
362    fn zscore_constant_errors() {
363        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![3.0; 4]).unwrap();
364        assert!(zscore(&a).is_err());
365    }
366
367    #[test]
368    fn zscore_2d_preserves_shape() {
369        let a = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
370            .unwrap();
371        let z = zscore(&a).unwrap();
372        assert_eq!(z.shape(), &[2, 3]);
373    }
374
375    #[test]
376    fn mode_picks_most_frequent() {
377        let a = Array::<i32, Ix1>::from_vec(Ix1::new([7]), vec![1, 2, 2, 3, 3, 3, 4]).unwrap();
378        let m = mode(&a).unwrap();
379        assert_eq!(m.value, 3);
380        assert_eq!(m.count, 3);
381    }
382
383    #[test]
384    fn mode_tiebreak_smallest_wins() {
385        let a = Array::<i32, Ix1>::from_vec(Ix1::new([6]), vec![5, 5, 7, 7, 1, 1]).unwrap();
386        let m = mode(&a).unwrap();
387        assert_eq!(m.value, 1);
388        assert_eq!(m.count, 2);
389    }
390
391    #[test]
392    fn iqr_50_percent_span() {
393        // Data 1..9: Q25 = 2.75, Q75 = 6.25 → IQR = 3.5 (numpy default).
394        let a = Array::<f64, Ix1>::from_vec(
395            Ix1::new([8]),
396            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
397        )
398        .unwrap();
399        let v = iqr(&a).unwrap();
400        assert!((v - 3.5).abs() < 1e-12, "IQR = {v}");
401    }
402
403    #[test]
404    fn sem_constant_data_is_zero() {
405        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![3.0; 5]).unwrap();
406        assert!(sem(&a).unwrap().abs() < 1e-12);
407    }
408
409    #[test]
410    fn sem_known_value() {
411        // [1, 2, 3, 4, 5]: mean=3, ddof=1 var = 10/4 = 2.5,
412        // std = sqrt(2.5), sem = std / sqrt(5) ≈ 0.7071...
413        let a = Array::<f64, Ix1>::from_vec(Ix1::new([5]), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
414        let s = sem(&a).unwrap();
415        let want = (2.5_f64 / 5.0).sqrt();
416        assert!((s - want).abs() < 1e-12, "sem = {s}, want {want}");
417    }
418
419    #[test]
420    fn gmean_known_value() {
421        // gmean(1, 2, 4, 8) = (1*2*4*8)^(1/4) = 64^0.25 = 2*sqrt(2)
422        let a = Array::<f64, Ix1>::from_vec(Ix1::new([4]), vec![1.0, 2.0, 4.0, 8.0]).unwrap();
423        let g = gmean(&a).unwrap();
424        let want = 2.0_f64.sqrt() * 2.0;
425        assert!((g - want).abs() < 1e-12, "gmean = {g}, want {want}");
426    }
427
428    #[test]
429    fn gmean_rejects_nonpositive() {
430        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 0.0, 4.0]).unwrap();
431        assert!(gmean(&a).is_err());
432    }
433
434    #[test]
435    fn hmean_known_value() {
436        // hmean(1, 2, 4) = 3 / (1 + 0.5 + 0.25) = 3 / 1.75 = 12/7
437        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, 2.0, 4.0]).unwrap();
438        let h = hmean(&a).unwrap();
439        assert!((h - 12.0 / 7.0).abs() < 1e-12, "hmean = {h}");
440    }
441
442    #[test]
443    fn hmean_rejects_nonpositive() {
444        let a = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![1.0, -1.0, 2.0]).unwrap();
445        assert!(hmean(&a).is_err());
446    }
447
448    #[test]
449    fn empty_array_errors_across_helpers() {
450        let a = Array::<f64, Ix1>::from_vec(Ix1::new([0]), Vec::new()).unwrap();
451        assert!(skew(&a).is_err());
452        assert!(kurtosis(&a, true).is_err());
453        assert!(zscore(&a).is_err());
454        assert!(mode(&a).is_err());
455        assert!(gmean(&a).is_err());
456        assert!(hmean(&a).is_err());
457        assert!(sem(&a).is_err());
458    }
459}