// ferray_stats/hypothesis.rs

1// ferray-stats: scipy.stats statistical-test functions (#724).
2//
3// Provides pearsonr, spearmanr, ttest_1samp, ttest_ind, ks_2samp.
4// p-values use the regularised incomplete beta function (Lentz's
5// continued fraction) for the t-distribution CDF and the
6// closed-form Kolmogorov-Smirnov series for ks_2samp. chi2 p-values
7// (needing incomplete gamma) are tracked separately as #743.
8
9use ferray_core::error::{FerrayError, FerrayResult};
10use ferray_core::{Array, Dimension, Element, Ix2};
11
12// ---------------------------------------------------------------------------
13// Regularised incomplete beta function I_x(a, b)
14//
15// Numerically stable evaluation via Lentz's continued fraction. Used
16// for the t-distribution CDF when computing two-sided p-values.
17// ---------------------------------------------------------------------------
18
19/// `I_x(a, b)` = regularised incomplete beta function. Returns NaN
20/// for inputs outside the valid domain.
21fn betainc(a: f64, b: f64, x: f64) -> f64 {
22    if !(0.0..=1.0).contains(&x) || a <= 0.0 || b <= 0.0 {
23        return f64::NAN;
24    }
25    if x == 0.0 {
26        return 0.0;
27    }
28    if x == 1.0 {
29        return 1.0;
30    }
31    // Numerical-Recipes recipe: choose the side of the symmetry
32    // identity that gives faster CF convergence.
33    let bt = (ln_gamma(a + b) - ln_gamma(a) - ln_gamma(b) + a * x.ln() + b * (1.0 - x).ln()).exp();
34    if x < (a + 1.0) / (a + b + 2.0) {
35        bt * betacf(a, b, x) / a
36    } else {
37        1.0 - bt * betacf(b, a, 1.0 - x) / b
38    }
39}
40
/// Natural logarithm of the gamma function, `ln Γ(x)`, for `x > 0`.
///
/// Lanczos approximation with `g = 5` (the classic Numerical
/// Recipes coefficient set); relative error is below ~2e-10 across
/// the positive reals. (Not a Stirling approximation, and not
/// 1e-14-accurate — the prior doc overstated both.)
fn ln_gamma(x: f64) -> f64 {
    const G: f64 = 5.0;
    // COEFFS[0] is the constant term; COEFFS[i] (i >= 1) is the
    // coefficient of the 1/(x + i) correction term.
    const COEFFS: [f64; 7] = [
        1.000_000_000_190_015,
        76.180_091_729_471_46,
        -86.505_320_329_416_77,
        24.014_098_240_830_91,
        -1.231_739_572_450_155,
        0.001_208_650_973_866_179,
        -0.000_005_395_239_384_953,
    ];
    let mut series = COEFFS[0];
    for (i, &c) in COEFFS.iter().enumerate().skip(1) {
        series += c / (x + i as f64);
    }
    // ln sqrt(2*pi).
    const HALF_LOG_2PI: f64 = 0.918_938_533_204_672_8;
    let tmp = x + G + 0.5;
    HALF_LOG_2PI + (x + 0.5) * tmp.ln() - tmp + (series / x).ln()
}
61
/// Continued-fraction factor used by `betainc`, evaluated with the
/// modified Lentz algorithm (Numerical Recipes `betacf`).
fn betacf(a: f64, b: f64, x: f64) -> f64 {
    const MAX_ITER: usize = 200;
    const EPS: f64 = 3e-16;
    // Floor applied to intermediates to avoid division by ~zero.
    const TINY: f64 = 1e-30;
    let clamp = |v: f64| if v.abs() < TINY { TINY } else { v };

    let qab = a + b;
    let qap = a + 1.0;
    let qam = a - 1.0;
    let mut c = 1.0_f64;
    let mut d = 1.0 / clamp(1.0 - qab * x / qap);
    let mut h = d;
    for m in 1..=MAX_ITER {
        let mf = m as f64;
        let m2 = 2.0 * mf;
        // Even-numbered continued-fraction coefficient.
        let even = mf * (b - mf) * x / ((qam + m2) * (a + m2));
        d = 1.0 / clamp(1.0 + even * d);
        c = clamp(1.0 + even / c);
        h *= d * c;
        // Odd-numbered continued-fraction coefficient.
        let odd = -(a + mf) * (qab + mf) * x / ((a + m2) * (qap + m2));
        d = 1.0 / clamp(1.0 + odd * d);
        c = clamp(1.0 + odd / c);
        let delta = d * c;
        h *= delta;
        if (delta - 1.0).abs() < EPS {
            break;
        }
    }
    h
}
110
111/// Two-sided p-value for a t-statistic with the given degrees of
112/// freedom. Computed via the t-distribution CDF expressed in terms
113/// of the regularised incomplete beta function.
114fn t_two_sided_p(t: f64, df: f64) -> f64 {
115    if !t.is_finite() || df <= 0.0 {
116        return f64::NAN;
117    }
118    betainc(df / 2.0, 0.5, df / (df + t * t))
119}
120
121// ---------------------------------------------------------------------------
122// Public statistical-test functions
123// ---------------------------------------------------------------------------
124
125/// Pearson product-moment correlation coefficient and two-sided
126/// p-value (#724).
127///
128/// Equivalent to `scipy.stats.pearsonr(x, y)`. Returns `(r, p)`
129/// where `r` is the sample correlation coefficient and `p` is the
130/// two-sided p-value under the null hypothesis that the two
131/// variables are uncorrelated. The p-value uses the t-distribution
132/// with `n - 2` degrees of freedom, expressed in terms of the
133/// regularised incomplete beta function.
134///
135/// # Errors
136/// `FerrayError::InvalidValue` if either array has fewer than 2
137/// elements or the lengths differ.
138pub fn pearsonr<T, D>(x: &Array<T, D>, y: &Array<T, D>) -> FerrayResult<(f64, f64)>
139where
140    T: Element + Copy + Into<f64>,
141    D: Dimension,
142{
143    let xs: Vec<f64> = x.iter().copied().map(Into::into).collect();
144    let ys: Vec<f64> = y.iter().copied().map(Into::into).collect();
145    if xs.len() != ys.len() {
146        return Err(FerrayError::shape_mismatch(format!(
147            "pearsonr: arrays have different lengths ({} vs {})",
148            xs.len(),
149            ys.len()
150        )));
151    }
152    let n = xs.len();
153    if n < 2 {
154        return Err(FerrayError::invalid_value(
155            "pearsonr: at least 2 observations required",
156        ));
157    }
158    let nf = n as f64;
159    let mx = xs.iter().sum::<f64>() / nf;
160    let my = ys.iter().sum::<f64>() / nf;
161    let mut sxx = 0.0_f64;
162    let mut syy = 0.0_f64;
163    let mut sxy = 0.0_f64;
164    for (a, b) in xs.iter().zip(ys.iter()) {
165        let dx = a - mx;
166        let dy = b - my;
167        sxx += dx * dx;
168        syy += dy * dy;
169        sxy += dx * dy;
170    }
171    let denom = (sxx * syy).sqrt();
172    if denom == 0.0 {
173        // Either x or y is constant — Pearson is undefined.
174        return Ok((f64::NAN, f64::NAN));
175    }
176    let r = (sxy / denom).clamp(-1.0, 1.0);
177    let df = nf - 2.0;
178    let t = r * (df / (1.0 - r * r).max(f64::MIN_POSITIVE)).sqrt();
179    let p = t_two_sided_p(t, df);
180    Ok((r, p))
181}
182
183/// Spearman rank correlation coefficient and two-sided p-value (#724).
184///
185/// Equivalent to `scipy.stats.spearmanr(x, y)`. Computes Pearson
186/// correlation on the ranks of the inputs (with average ranks for
187/// ties). Returns `(rho, p)` using the same t-distribution p-value
188/// approximation as Pearson with `df = n - 2`.
189///
190/// # Errors
191/// Same as `pearsonr`.
192pub fn spearmanr<T, D>(x: &Array<T, D>, y: &Array<T, D>) -> FerrayResult<(f64, f64)>
193where
194    T: Element + Copy + PartialOrd + Into<f64>,
195    D: Dimension,
196{
197    let xs: Vec<f64> = x.iter().copied().map(Into::into).collect();
198    let ys: Vec<f64> = y.iter().copied().map(Into::into).collect();
199    if xs.len() != ys.len() {
200        return Err(FerrayError::shape_mismatch(format!(
201            "spearmanr: arrays have different lengths ({} vs {})",
202            xs.len(),
203            ys.len()
204        )));
205    }
206    if xs.len() < 2 {
207        return Err(FerrayError::invalid_value(
208            "spearmanr: at least 2 observations required",
209        ));
210    }
211    let rx = average_ranks(&xs);
212    let ry = average_ranks(&ys);
213    pearsonr_f64_slices(&rx, &ry)
214}
215
216/// Pearson correlation on already-prepared f64 slices (shared
217/// helper for spearmanr).
218fn pearsonr_f64_slices(xs: &[f64], ys: &[f64]) -> FerrayResult<(f64, f64)> {
219    let n = xs.len();
220    if n < 2 {
221        return Err(FerrayError::invalid_value(
222            "pearsonr: at least 2 observations required",
223        ));
224    }
225    let nf = n as f64;
226    let mx = xs.iter().sum::<f64>() / nf;
227    let my = ys.iter().sum::<f64>() / nf;
228    let mut sxx = 0.0_f64;
229    let mut syy = 0.0_f64;
230    let mut sxy = 0.0_f64;
231    for (a, b) in xs.iter().zip(ys.iter()) {
232        let dx = a - mx;
233        let dy = b - my;
234        sxx += dx * dx;
235        syy += dy * dy;
236        sxy += dx * dy;
237    }
238    let denom = (sxx * syy).sqrt();
239    if denom == 0.0 {
240        return Ok((f64::NAN, f64::NAN));
241    }
242    let r = (sxy / denom).clamp(-1.0, 1.0);
243    let df = nf - 2.0;
244    let t = r * (df / (1.0 - r * r).max(f64::MIN_POSITIVE)).sqrt();
245    let p = t_two_sided_p(t, df);
246    Ok((r, p))
247}
248
/// 1-based ranks of the input values, with every member of a tie
/// group receiving the mean of the ranks the group spans.
fn average_ranks(xs: &[f64]) -> Vec<f64> {
    let n = xs.len();
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| {
        xs[a]
            .partial_cmp(&xs[b])
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let mut ranks = vec![0.0_f64; n];
    let mut start = 0;
    while start < n {
        // Grow the tie group [start, end] over equal values.
        let mut end = start;
        while end + 1 < n && xs[order[end + 1]] == xs[order[start]] {
            end += 1;
        }
        // Mean of the 1-based ranks start+1 ..= end+1.
        let avg = (start as f64 + end as f64) / 2.0 + 1.0;
        for &original_index in &order[start..=end] {
            ranks[original_index] = avg;
        }
        start = end + 1;
    }
    ranks
}
274
275/// One-sample Student's t-test (#724).
276///
277/// Equivalent to `scipy.stats.ttest_1samp(a, popmean)`. Returns
278/// `(t, p)` where `t = (mean(a) - popmean) / (s / sqrt(n))` with
279/// `s` the sample standard deviation (ddof=1) and `p` the
280/// two-sided p-value with `n - 1` degrees of freedom.
281///
282/// # Errors
283/// `FerrayError::InvalidValue` if `a` has fewer than 2 elements.
284pub fn ttest_1samp<T, D>(a: &Array<T, D>, popmean: f64) -> FerrayResult<(f64, f64)>
285where
286    T: Element + Copy + Into<f64>,
287    D: Dimension,
288{
289    let xs: Vec<f64> = a.iter().copied().map(Into::into).collect();
290    let n = xs.len();
291    if n < 2 {
292        return Err(FerrayError::invalid_value(
293            "ttest_1samp: at least 2 observations required",
294        ));
295    }
296    let nf = n as f64;
297    let mean = xs.iter().sum::<f64>() / nf;
298    let var = xs.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (nf - 1.0);
299    if var == 0.0 {
300        return Ok((f64::NAN, f64::NAN));
301    }
302    let std = var.sqrt();
303    let se = std / nf.sqrt();
304    let t = (mean - popmean) / se;
305    let p = t_two_sided_p(t, nf - 1.0);
306    Ok((t, p))
307}
308
309/// Independent (Welch's) two-sample t-test (#724).
310///
311/// Equivalent to `scipy.stats.ttest_ind(a, b, equal_var=False)`.
312/// Returns `(t, p)`. The Welch–Satterthwaite degrees-of-freedom
313/// approximation is used for unequal variances.
314///
315/// # Errors
316/// `FerrayError::InvalidValue` if either sample has fewer than 2
317/// elements.
318pub fn ttest_ind<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<(f64, f64)>
319where
320    T: Element + Copy + Into<f64>,
321    D: Dimension,
322{
323    let xs: Vec<f64> = a.iter().copied().map(Into::into).collect();
324    let ys: Vec<f64> = b.iter().copied().map(Into::into).collect();
325    if xs.len() < 2 || ys.len() < 2 {
326        return Err(FerrayError::invalid_value(
327            "ttest_ind: each sample needs at least 2 observations",
328        ));
329    }
330    let n1 = xs.len() as f64;
331    let n2 = ys.len() as f64;
332    let m1 = xs.iter().sum::<f64>() / n1;
333    let m2 = ys.iter().sum::<f64>() / n2;
334    let v1 = xs.iter().map(|x| (x - m1).powi(2)).sum::<f64>() / (n1 - 1.0);
335    let v2 = ys.iter().map(|x| (x - m2).powi(2)).sum::<f64>() / (n2 - 1.0);
336    if v1 == 0.0 && v2 == 0.0 {
337        return Ok((f64::NAN, f64::NAN));
338    }
339    let se = (v1 / n1 + v2 / n2).sqrt();
340    let t = (m1 - m2) / se;
341    let num = (v1 / n1 + v2 / n2).powi(2);
342    let denom = (v1 / n1).powi(2) / (n1 - 1.0) + (v2 / n2).powi(2) / (n2 - 1.0);
343    let df = num / denom;
344    let p = t_two_sided_p(t, df);
345    Ok((t, p))
346}
347
/// Result of [`chi2_contingency`].
#[derive(Debug, Clone)]
pub struct Chi2ContingencyResult {
    /// The chi-squared test statistic.
    pub statistic: f64,
    /// Two-sided p-value under H_0 of independence.
    pub p_value: f64,
    /// Degrees of freedom: `(rows - 1) * (cols - 1)`.
    pub dof: usize,
    /// Expected frequencies under independence (same shape as input).
    pub expected: Array<f64, Ix2>,
}
360
361/// Chi-squared test of independence on a 2-D contingency table (#743).
362///
363/// Equivalent to `scipy.stats.chi2_contingency(observed)` (without
364/// Yates' correction). Returns the test statistic, p-value, degrees
365/// of freedom, and expected frequencies. The p-value uses the
366/// regularised incomplete gamma function for the chi-squared CDF.
367///
368/// # Errors
369/// `FerrayError::InvalidValue` if the table is empty or any row /
370/// column total is zero (degenerate independence model).
371pub fn chi2_contingency<T>(observed: &Array<T, Ix2>) -> FerrayResult<Chi2ContingencyResult>
372where
373    T: Element + Copy + Into<f64>,
374{
375    let shape = observed.shape();
376    let nr = shape[0];
377    let nc = shape[1];
378    if nr == 0 || nc == 0 {
379        return Err(FerrayError::invalid_value(
380            "chi2_contingency: table must be non-empty along both axes",
381        ));
382    }
383    let data: Vec<f64> = observed.iter().copied().map(Into::into).collect();
384    // Row sums, column sums, and grand total.
385    let mut row_sums = vec![0.0_f64; nr];
386    let mut col_sums = vec![0.0_f64; nc];
387    let mut total = 0.0_f64;
388    for i in 0..nr {
389        for j in 0..nc {
390            let v = data[i * nc + j];
391            row_sums[i] += v;
392            col_sums[j] += v;
393            total += v;
394        }
395    }
396    if total == 0.0 {
397        return Err(FerrayError::invalid_value(
398            "chi2_contingency: total frequency is zero",
399        ));
400    }
401    if row_sums.contains(&0.0) || col_sums.contains(&0.0) {
402        return Err(FerrayError::invalid_value(
403            "chi2_contingency: a row or column has zero total — independence model is degenerate",
404        ));
405    }
406    let mut expected = vec![0.0_f64; nr * nc];
407    let mut chi2 = 0.0_f64;
408    for i in 0..nr {
409        for j in 0..nc {
410            let e = row_sums[i] * col_sums[j] / total;
411            expected[i * nc + j] = e;
412            let o = data[i * nc + j];
413            // Standard chi-squared term; e is guaranteed > 0 above.
414            let d = o - e;
415            chi2 += d * d / e;
416        }
417    }
418    let dof = (nr - 1) * (nc - 1);
419    let p = chi2_sf(chi2, dof as f64);
420    Ok(Chi2ContingencyResult {
421        statistic: chi2,
422        p_value: p,
423        dof,
424        expected: Array::<f64, Ix2>::from_vec(Ix2::new([nr, nc]), expected)?,
425    })
426}
427
428/// Survival function (1 - CDF) of the chi-squared distribution
429/// with `df` degrees of freedom evaluated at `x`. Uses the
430/// regularised upper incomplete gamma function `Q(df/2, x/2)`.
431fn chi2_sf(x: f64, df: f64) -> f64 {
432    if !x.is_finite() || x < 0.0 || df <= 0.0 {
433        return f64::NAN;
434    }
435    if x == 0.0 {
436        return 1.0;
437    }
438    gamma_q(df / 2.0, x / 2.0)
439}
440
441/// Regularised upper incomplete gamma `Q(a, x) = 1 - P(a, x)`.
442///
443/// Uses the series for `x < a + 1` and the continued fraction for
444/// `x >= a + 1` (Numerical Recipes recipe).
445fn gamma_q(a: f64, x: f64) -> f64 {
446    if a <= 0.0 || x < 0.0 || !x.is_finite() {
447        return f64::NAN;
448    }
449    if x == 0.0 {
450        return 1.0;
451    }
452    if x < a + 1.0 {
453        1.0 - gamma_p_series(a, x)
454    } else {
455        gamma_q_cf(a, x)
456    }
457}
458
459/// Series expansion of `P(a, x)` valid for `x < a + 1`.
460fn gamma_p_series(a: f64, x: f64) -> f64 {
461    const MAX_ITER: usize = 200;
462    const EPS: f64 = 3e-16;
463    let mut ap = a;
464    let mut sum = 1.0_f64 / a;
465    let mut term = sum;
466    for _ in 0..MAX_ITER {
467        ap += 1.0;
468        term *= x / ap;
469        sum += term;
470        if term.abs() < sum.abs() * EPS {
471            break;
472        }
473    }
474    sum * (-x + a * x.ln() - ln_gamma(a)).exp()
475}
476
477/// Continued-fraction expansion of `Q(a, x)` valid for `x >= a + 1`.
478/// Lentz's algorithm.
479fn gamma_q_cf(a: f64, x: f64) -> f64 {
480    const MAX_ITER: usize = 200;
481    const EPS: f64 = 3e-16;
482    let mut b = x + 1.0 - a;
483    let mut c = 1.0 / 1e-30;
484    let mut d = 1.0 / b;
485    let mut h = d;
486    for i in 1..=MAX_ITER {
487        let an = -(i as f64) * (i as f64 - a);
488        b += 2.0;
489        d = an * d + b;
490        if d.abs() < 1e-30 {
491            d = 1e-30;
492        }
493        c = b + an / c;
494        if c.abs() < 1e-30 {
495            c = 1e-30;
496        }
497        d = 1.0 / d;
498        let del = d * c;
499        h *= del;
500        if (del - 1.0).abs() < EPS {
501            break;
502        }
503    }
504    h * (-x + a * x.ln() - ln_gamma(a)).exp()
505}
506
507/// Two-sample Kolmogorov–Smirnov test (#724).
508///
509/// Equivalent to `scipy.stats.ks_2samp(a, b)`. Returns
510/// `(D, p)` where `D` is the maximum absolute difference between
511/// the two empirical CDFs and `p` is the two-sided asymptotic
512/// p-value via the closed-form Smirnov series.
513///
514/// # Errors
515/// `FerrayError::InvalidValue` if either sample is empty.
516pub fn ks_2samp<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<(f64, f64)>
517where
518    T: Element + Copy + PartialOrd + Into<f64>,
519    D: Dimension,
520{
521    let mut xs: Vec<f64> = a.iter().copied().map(Into::into).collect();
522    let mut ys: Vec<f64> = b.iter().copied().map(Into::into).collect();
523    if xs.is_empty() || ys.is_empty() {
524        return Err(FerrayError::invalid_value(
525            "ks_2samp: both samples must be non-empty",
526        ));
527    }
528    xs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
529    ys.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
530
531    // KS statistic: max over the union of sample points of
532    // |F_n1(x) - F_n2(x)|. Computed by evaluating each empirical
533    // CDF at every union point via partition_point (== numpy
534    // searchsorted side='right'). Matches scipy.stats.ks_2samp.
535    let n1 = xs.len() as f64;
536    let n2 = ys.len() as f64;
537    let mut union: Vec<f64> = Vec::with_capacity(xs.len() + ys.len());
538    union.extend_from_slice(&xs);
539    union.extend_from_slice(&ys);
540    union.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
541    let mut d = 0.0_f64;
542    for &v in &union {
543        let i = xs.partition_point(|x| *x <= v);
544        let j = ys.partition_point(|y| *y <= v);
545        let here = (i as f64 / n1 - j as f64 / n2).abs();
546        if here > d {
547            d = here;
548        }
549    }
550
551    // Asymptotic Kolmogorov distribution for the p-value.
552    let en = (n1 * n2 / (n1 + n2)).sqrt();
553    let lambda = (en + 0.12 + 0.11 / en) * d;
554    let p = ks_p_kolmogorov(lambda).clamp(0.0, 1.0);
555    Ok((d, p))
556}
557
/// Kolmogorov asymptotic survival series:
/// Q(λ) = 2 Σ_{k=1..∞} (-1)^(k-1) exp(-2 k² λ²).
fn ks_p_kolmogorov(lambda: f64) -> f64 {
    if lambda <= 0.0 {
        return 1.0;
    }
    let mut total = 0.0_f64;
    let mut previous = 0.0_f64;
    for k in 1..=200 {
        let kf = k as f64;
        // Alternating sign: +, -, +, … starting positive at k = 1.
        let sign = if k % 2 == 1 { 1.0 } else { -1.0 };
        total += sign * (-2.0 * kf * kf * lambda * lambda).exp();
        if (total - previous).abs() < 1e-15 {
            break;
        }
        previous = total;
    }
    2.0 * total
}
579
#[cfg(test)]
mod tests {
    use super::*;
    use ferray_core::Ix1;

    // Convenience: build a 1-D f64 array from a Vec.
    fn arr1(v: Vec<f64>) -> Array<f64, Ix1> {
        let n = v.len();
        Array::from_vec(Ix1::new([n]), v).unwrap()
    }

    #[test]
    fn pearsonr_perfect_positive() {
        let x = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let y = arr1(vec![2.0, 4.0, 6.0, 8.0, 10.0]);
        let (r, p) = pearsonr(&x, &y).unwrap();
        assert!((r - 1.0).abs() < 1e-12);
        // Perfect correlation → p effectively 0.
        assert!(p < 1e-6);
    }

    #[test]
    fn pearsonr_perfect_negative() {
        let x = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let y = arr1(vec![5.0, 4.0, 3.0, 2.0, 1.0]);
        let (r, p) = pearsonr(&x, &y).unwrap();
        assert!((r + 1.0).abs() < 1e-12);
        assert!(p < 1e-6);
    }

    #[test]
    fn pearsonr_uncorrelated_high_p() {
        // Random-ish numbers with no linear relationship → r small, p large.
        let x = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let y = arr1(vec![7.5, -1.0, 2.0, 5.5, -3.0, 6.0, 0.5, 1.5]);
        let (r, p) = pearsonr(&x, &y).unwrap();
        assert!(r.abs() < 1.0);
        assert!(p > 0.0 && p <= 1.0);
    }

    #[test]
    fn pearsonr_constant_returns_nan() {
        // A constant input has zero variance, so r is undefined.
        let x = arr1(vec![3.0, 3.0, 3.0, 3.0]);
        let y = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let (r, p) = pearsonr(&x, &y).unwrap();
        assert!(r.is_nan());
        assert!(p.is_nan());
    }

    #[test]
    fn pearsonr_length_mismatch_errors() {
        let x = arr1(vec![1.0, 2.0]);
        let y = arr1(vec![1.0, 2.0, 3.0]);
        assert!(pearsonr(&x, &y).is_err());
    }

    #[test]
    fn spearmanr_monotonic_nonlinear_is_one() {
        // y is a monotonic increasing nonlinear function of x.
        let x = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let y = arr1(vec![1.0, 4.0, 9.0, 16.0, 25.0]);
        let (rho, _) = spearmanr(&x, &y).unwrap();
        assert!((rho - 1.0).abs() < 1e-12);
    }

    #[test]
    fn ttest_1samp_known() {
        // Under H_0: μ = 5, sample [4, 5, 6] has mean 5, var 1, std 1.
        // SE = 1/√3, t = 0.
        let a = arr1(vec![4.0, 5.0, 6.0]);
        let (t, p) = ttest_1samp(&a, 5.0).unwrap();
        assert!(t.abs() < 1e-12);
        // Two-sided p for t = 0 is exactly 1.
        assert!((p - 1.0).abs() < 1e-12);
    }

    #[test]
    fn ttest_1samp_strong_signal() {
        // Sample very different from popmean — p should be tiny.
        let a = arr1(vec![10.0, 11.0, 9.0, 10.5, 9.5, 10.0]);
        let (_, p) = ttest_1samp(&a, 0.0).unwrap();
        assert!(p < 1e-6);
    }

    #[test]
    fn ttest_ind_identical_samples_t_zero() {
        let a = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let b = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let (t, p) = ttest_ind(&a, &b).unwrap();
        assert!(t.abs() < 1e-12);
        assert!((p - 1.0).abs() < 1e-12);
    }

    #[test]
    fn ttest_ind_strong_difference() {
        // Welch's t for [1..5] vs [10..14]: t = -9, df ≈ 8.
        // p ≈ 1.7e-5 (closed-form via the incomplete-beta tail).
        let a = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let b = arr1(vec![10.0, 11.0, 12.0, 13.0, 14.0]);
        let (_, p) = ttest_ind(&a, &b).unwrap();
        assert!(p < 1e-3, "p too large: {p}");
    }

    #[test]
    fn ks_2samp_identical_samples_d_zero() {
        let a = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let b = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let (d, p) = ks_2samp(&a, &b).unwrap();
        assert!(d < 1e-12);
        assert!(p > 0.99);
    }

    #[test]
    fn ks_2samp_clearly_different_distributions() {
        let a = arr1(vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]);
        let b = arr1(vec![
            10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0,
        ]);
        let (d, p) = ks_2samp(&a, &b).unwrap();
        // No overlap → D = 1.0.
        assert!((d - 1.0).abs() < 1e-12);
        assert!(p < 0.01);
    }

    #[test]
    fn ks_2samp_partial_overlap() {
        let a = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let b = arr1(vec![3.0, 4.0, 5.0, 6.0, 7.0]);
        let (d, _p) = ks_2samp(&a, &b).unwrap();
        // Visual inspection: at x=2, F(x)=2/5=0.4, G(x)=0/5=0.0 → diff 0.4.
        assert!(d > 0.0 && d < 1.0);
    }

    #[test]
    fn empty_inputs_error() {
        let empty = arr1(vec![]);
        let one = arr1(vec![1.0]);
        let two = arr1(vec![1.0, 2.0]);
        assert!(pearsonr(&one, &one).is_err());
        assert!(spearmanr(&one, &one).is_err());
        assert!(ttest_1samp(&one, 0.0).is_err());
        assert!(ttest_ind(&one, &two).is_err());
        assert!(ks_2samp(&empty, &two).is_err());
    }

    // ---- chi2_contingency (#743) ---------------------------------------

    #[test]
    fn chi2_independent_table_yields_high_p() {
        // 2x2 table where rows are proportional to columns —
        // expected = observed, chi2 = 0, p = 1.
        let obs = Array::<f64, ferray_core::Ix2>::from_vec(
            ferray_core::Ix2::new([2, 2]),
            vec![10.0, 10.0, 20.0, 20.0],
        )
        .unwrap();
        let r = chi2_contingency(&obs).unwrap();
        assert!(r.statistic.abs() < 1e-12);
        assert!((r.p_value - 1.0).abs() < 1e-12);
        assert_eq!(r.dof, 1);
    }

    #[test]
    fn chi2_strong_dependence_low_p() {
        // 2x2 table with strong association.
        let obs = Array::<f64, ferray_core::Ix2>::from_vec(
            ferray_core::Ix2::new([2, 2]),
            vec![100.0, 0.0, 0.0, 100.0],
        )
        .unwrap();
        let r = chi2_contingency(&obs).unwrap();
        // Massive chi2, tiny p.
        assert!(r.statistic > 100.0);
        assert!(r.p_value < 1e-6);
        assert_eq!(r.dof, 1);
    }

    #[test]
    fn chi2_known_value_2x3() {
        // Standard textbook 2x3 example.
        // observed = [[10, 20, 30], [40, 50, 60]]
        // total = 210, row sums = (60, 150), col sums = (50, 70, 90)
        // expected[0,0] = 60*50/210 = 100/7 ≈ 14.286
        // chi2 = 2.8 exactly (sum of (o - e)²/e over all six cells)
        let obs = Array::<f64, ferray_core::Ix2>::from_vec(
            ferray_core::Ix2::new([2, 3]),
            vec![10.0, 20.0, 30.0, 40.0, 50.0, 60.0],
        )
        .unwrap();
        let r = chi2_contingency(&obs).unwrap();
        // Check dof and that the p-value is a valid probability; the
        // first expected cell is pinned exactly below.
        assert_eq!(r.dof, 2);
        assert!(r.p_value > 0.0 && r.p_value <= 1.0);
        // First expected cell = 60 * 50 / 210 = 14.2857...
        let e00 = r.expected.as_slice().unwrap()[0];
        assert!((e00 - 60.0 * 50.0 / 210.0).abs() < 1e-10);
    }

    #[test]
    fn chi2_zero_row_total_errors() {
        let obs = Array::<f64, ferray_core::Ix2>::from_vec(
            ferray_core::Ix2::new([2, 2]),
            vec![0.0, 0.0, 1.0, 1.0],
        )
        .unwrap();
        assert!(chi2_contingency(&obs).is_err());
    }

    #[test]
    fn chi2_empty_table_errors() {
        let obs = Array::<f64, ferray_core::Ix2>::from_vec(ferray_core::Ix2::new([0, 0]), vec![])
            .unwrap();
        assert!(chi2_contingency(&obs).is_err());
    }

    #[test]
    fn gamma_q_known_values() {
        // Q(1, x) = exp(-x)
        assert!((gamma_q(1.0, 1.0) - (-1.0_f64).exp()).abs() < 1e-12);
        assert!((gamma_q(1.0, 2.0) - (-2.0_f64).exp()).abs() < 1e-12);
        // Q(a, 0) = 1
        assert!((gamma_q(2.5, 0.0) - 1.0).abs() < 1e-12);
    }

    #[test]
    fn betainc_known_values() {
        // I_{0.5}(1, 1) = 0.5
        assert!((betainc(1.0, 1.0, 0.5) - 0.5).abs() < 1e-10);
        // I_0 = 0, I_1 = 1
        assert_eq!(betainc(2.0, 3.0, 0.0), 0.0);
        assert_eq!(betainc(2.0, 3.0, 1.0), 1.0);
    }
}
811}